{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 前言"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**本文参照特征工程这本书，完整地完成一套流程**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 420,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:02.667955Z",
     "start_time": "2020-08-05T13:41:02.311931Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:02.672674Z",
     "start_time": "2020-08-05T13:41:02.669992Z"
    }
   },
   "outputs": [],
   "source": [
    "# 过滤掉一些警告\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.293303Z",
     "start_time": "2020-08-05T13:41:02.674186Z"
    }
   },
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 307,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.492222Z",
     "start_time": "2020-08-05T13:41:03.296006Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import (AdaBoostClassifier, RandomForestClassifier, \n",
    "                              GradientBoostingClassifier, ExtraTreesClassifier)\n",
    "from sklearn.linear_model import LogisticRegressionCV\n",
    "from sklearn.naive_bayes import BernoulliNB, GaussianNB\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.tree import DecisionTreeClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 306,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 305,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple\n",
      "Collecting xgboost\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/7c/32/a11befbb003e0e6b7e062a77f010dfcec0ec3589be537b02d2eb2ff93b9a/xgboost-1.1.1-py3-none-manylinux2010_x86_64.whl (127.6 MB)\n",
      "\u001b[K     |████████████████████████████████| 127.6 MB 17 kB/s  eta 0:00:01��████▎                       | 33.1 MB 366 kB/s eta 0:04:18kB/s eta 0:03:43      | 58.6 MB 426 kB/s eta 0:02:42/s eta 0:02:46:01:35 | 105.6 MB 423 kB/s eta 0:00:52��████████▊    | 110.5 MB 3.2 MB/s eta 0:00:06��███████████████████▌   | 113.7 MB 4.8 MB/s eta 0:00:03    |█████████████████████████████▌  | 117.5 MB 443 kB/s eta 0:00:230 MB 425 kB/s eta 0:00:169 MB 358 kB/s eta 0:00:11███████▍| 125.2 MB 358 kB/s eta 0:00:07\n",
      "\u001b[?25hRequirement already satisfied: numpy in /root/jupyter env/lib64/python3.6/site-packages (from xgboost) (1.19.1)\n",
      "Requirement already satisfied: scipy in /root/jupyter env/lib64/python3.6/site-packages (from xgboost) (1.5.2)\n",
      "Installing collected packages: xgboost\n",
      "Successfully installed xgboost-1.1.1\n"
     ]
    }
   ],
   "source": [
    "!pip install xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.498289Z",
     "start_time": "2020-08-05T13:41:03.494842Z"
    }
   },
   "outputs": [],
   "source": [
    "# 准备进行网格搜索\n",
    "# CV表示交叉验证 cross validation\n",
    "from sklearn.model_selection import GridSearchCV  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.508179Z",
     "start_time": "2020-08-05T13:41:03.501625Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.preprocessing import StandardScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.517313Z",
     "start_time": "2020-08-05T13:41:03.511202Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import KFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.527479Z",
     "start_time": "2020-08-05T13:41:03.520913Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.541393Z",
     "start_time": "2020-08-05T13:41:03.530171Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['test.csv',\n",
       " 'README.md',\n",
       " 'gender_submission.csv',\n",
       " '1. Titanic数据预处理.ipynb',\n",
       " '.ipynb_checkpoints',\n",
       " 'train.csv',\n",
       " '参照特征工程入门这本书.ipynb',\n",
       " '.git',\n",
       " '参考另一个.ipynb']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.listdir()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.568663Z",
     "start_time": "2020-08-05T13:41:03.544444Z"
    }
   },
   "outputs": [],
   "source": [
    "train = pd.read_csv('train.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.583368Z",
     "start_time": "2020-08-05T13:41:03.571728Z"
    }
   },
   "outputs": [],
   "source": [
    "test = pd.read_csv('test.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据预处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这部分主要完成数据的缺失值填充"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.609627Z",
     "start_time": "2020-08-05T13:41:03.585705Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.635628Z",
     "start_time": "2020-08-05T13:41:03.611393Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      " #   Column       Non-Null Count  Dtype  \n",
      "---  ------       --------------  -----  \n",
      " 0   PassengerId  891 non-null    int64  \n",
      " 1   Survived     891 non-null    int64  \n",
      " 2   Pclass       891 non-null    int64  \n",
      " 3   Name         891 non-null    object \n",
      " 4   Sex          891 non-null    object \n",
      " 5   Age          714 non-null    float64\n",
      " 6   SibSp        891 non-null    int64  \n",
      " 7   Parch        891 non-null    int64  \n",
      " 8   Ticket       891 non-null    object \n",
      " 9   Fare         891 non-null    float64\n",
      " 10  Cabin        204 non-null    object \n",
      " 11  Embarked     889 non-null    object \n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 83.7+ KB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.651659Z",
     "start_time": "2020-08-05T13:41:03.638299Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId    0.000000\n",
       "Survived       0.000000\n",
       "Pclass         0.000000\n",
       "Name           0.000000\n",
       "Sex            0.000000\n",
       "Age            0.198653\n",
       "SibSp          0.000000\n",
       "Parch          0.000000\n",
       "Ticket         0.000000\n",
       "Fare           0.000000\n",
       "Cabin          0.771044\n",
       "Embarked       0.002245\n",
       "dtype: float64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.isnull().mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "****\n",
    "从这里可以看出 Age Cabin Embarked是有缺失值的\n",
    "\n",
    "所以需要填充这些缺失值。但Cabin的缺失值太多，我们先丢弃这一列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.664277Z",
     "start_time": "2020-08-05T13:41:03.653838Z"
    }
   },
   "outputs": [],
   "source": [
    "train.drop('Cabin', axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Age 是定量数据\n",
    "\n",
    "Embarked 是定类数据\n",
    "\n",
    "- 定类数据用众数填充缺失值\n",
    "- 定量数据用中位数或者平均数填充缺失值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.679752Z",
     "start_time": "2020-08-05T13:41:03.667427Z"
    }
   },
   "outputs": [],
   "source": [
    "train['Age'].fillna(train.Age.median(), inplace=True)\n",
    "train['Embarked'].fillna(train.Embarked.mode()[0], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.700248Z",
     "start_time": "2020-08-05T13:41:03.682181Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId    0.0\n",
       "Survived       0.0\n",
       "Pclass         0.0\n",
       "Name           0.0\n",
       "Sex            0.0\n",
       "Age            0.0\n",
       "SibSp          0.0\n",
       "Parch          0.0\n",
       "Ticket         0.0\n",
       "Fare           0.0\n",
       "Embarked       0.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# abs可以发现缺失值填充完成\n",
    "train.isnull().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.709359Z",
     "start_time": "2020-08-05T13:41:03.703119Z"
    }
   },
   "outputs": [],
   "source": [
    "columns_list = train.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:03.717212Z",
     "start_time": "2020-08-05T13:41:03.711429Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['PassengerId',\n",
       " 'Survived',\n",
       " 'Pclass',\n",
       " 'Name',\n",
       " 'Sex',\n",
       " 'Age',\n",
       " 'SibSp',\n",
       " 'Parch',\n",
       " 'Ticket',\n",
       " 'Fare',\n",
       " 'Embarked']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "columns_list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征理解"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征等级说明"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "****\n",
    "**这一部分主要是对数据类型进行理解，数据首先可以分成 定性数据和定量数据**\n",
    "\n",
    "**定量数据：本质上是数值，描述一个特征的数量**\n",
    "\n",
    "**定性数据：本质上是类别，描述一个特征的种类**\n",
    "****"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**定性数据：pclass，name，sex，ticked，pclass**\n",
    "\n",
    "**定量数据：age，sibsp，parch，fare**\n",
    "****"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "然而还可以继续进行分类，定性数据还可以分成：\n",
    "\n",
    "- 定类等级\n",
    "- 定序等级\n",
    "- 定距等级\n",
    "- 定比等级\n",
    "\n",
    "定类等级：如性别，就是分成男性和女性\n",
    "\n",
    "定序等级：pclass，虽然也是类别，但我们希望某些类别数值越大越好。如不喜欢，一般，喜欢三个层度\n",
    "\n",
    "定距等级：主要是针对定量数据。但和定序数据类似。不仅可以对数据进行排序，还可以数据进行加减。如温度。这类数据要用直方图查看\n",
    "\n",
    "定比等级：定比等级还可以进行乘除运算。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "****\n",
    "定类等级：survived，name，sex，ticked\n",
    "\n",
    "定序等级：pclass\n",
    "\n",
    "定距等级：sibsp，parch\n",
    "\n",
    "定比等级：age，fare"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征等级的图像描述方法"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**定距等级和定比等级可以用直方图来描述。不能用饼状图或者是条形图来描述**\n",
    "\n",
    "**定类等级和定序等级可以使用条形图描述**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.075644Z",
     "start_time": "2020-08-05T13:41:03.718884Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([ 54.,  46., 177., 346., 118.,  70.,  45.,  24.,   9.,   2.]),\n",
       " array([ 0.42 ,  8.378, 16.336, 24.294, 32.252, 40.21 , 48.168, 56.126,\n",
       "        64.084, 72.042, 80.   ]),\n",
       " <BarContainer object of 10 artists>)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAARsUlEQVR4nO3df6xl5V3v8fdHQFqpKSDnTsaZiYerow0aO3CPSNPGVLAWqHEw0QZilDQkownNbW+ae++giVojCU20aBMlGQU7Nb1Q7A+ZUPyBUxJTk4IHOqUzTLkdy1RmMjCnP6CtjaTQr3/sZ+x2ODPnxz777D2P71eystd61lp7fc9Z63zOOs9ea51UFZKkvnzXpAuQJK09w12SOmS4S1KHDHdJ6pDhLkkdOnvSBQBcdNFFNTs7O+kyJOmM8uijj36pqmYWmzcV4T47O8v8/Pyky5CkM0qSL55qnt0yktQhw12SOmS4S1KHlgz3JK9I8kiSzyQ5kOTdrf39SZ5Ksq8N21p7krwvyaEkjye5bMxfgyTpJMv5QPUF4Mqq+kaSc4BPJvnrNu9/V9WHT1r+GmBrG34SuKO9SpLWyZJn7jXwjTZ5ThtO97Sx7cAH2nqfAs5PsnH0UiVJy7WsPvckZyXZBxwHHqyqh9usW1vXy+1Jzm1tm4Cnh1Y/0tpOfs8dSeaTzC8sLKz+K5Akvcyywr2qXqqqbcBm4PIkPwbcArwG+AngQuD/rmTDVbWrquaqam5mZtFr8CVJq7Siq2Wq6jngIeDqqjrWul5eAP4cuLwtdhTYMrTa5tYmSVonS36gmmQG+FZVPZfklcCbgPck2VhVx5IEuA7Y31bZA7w9yT0MPkh9vqqOjad8rbfZnR+fyHYP3/aWiWxXOlMt52qZjcDuJGcxONO/t6ruT/KJFvwB9gG/3pZ/ALgWOAR8E3jbmlctSTqtJcO9qh4HLl2k/cpTLF/AzaOXJklaLe9QlaQOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SerQkuGe5BVJHknymSQHkry7tV+c5OEkh5J8KMl3t/Zz2/ShNn92zF+DJOkkyzlzfwG4sqpeC2wDrk5yBfAe4Paq+iHgq8BNbfmbgK+29tvbcpKkdbRkuNfAN9rkOW0o4Ergw619N3BdG9/epmnzr0qStSpYkrS0ZfW5JzkryT7gOPAg8M/Ac1X1YlvkCLCpjW8CngZo858Hvm+R99yRZD7J/MLCwkhfhCTpP1tWuFfVS1W1DdgMXA68ZtQNV9WuqpqrqrmZmZlR306SNGRFV8tU1XPAQ8DrgPOTnN1mbQaOtvGjwBaANv/VwJfXolhJ0vIs52qZmSTnt/FXAm8CDjII+V9si90I3NfG97Rp2vxPVFWtYc2SpCWcvfQibAR2JzmLwS+De6vq/iRPAPck+T3g08Cdbfk7gb9Icgj4CnD9GOqWJJ3GkuFeVY8Dly7S/gUG/e8nt/8b8EtrUp0kaVW8Q1WSOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ0uGe5ItSR5K8kSSA0ne0dp/J8nRJPvacO3QOrckOZTkySRvHucXIEl6ubOXscyLwLuq6rEk3ws8muTBNu/2qvr94YWTXAJcD/wo8P3A3yf54ap6aS0LlySd2pJn7lV1rKoea+NfBw4Cm06zynbgnqp6oaqeAg4Bl69FsZKk5VlRn3uSWeBS4OHW9PYkjye5K8kFrW0T8PTQakdY5JdBkh1J5pPMLywsrLxySdIpLTvck7wK+Ajwzqr6GnAH8IPANuAY8Acr2XBV7aqquaqam5mZWcmqkqQlLCvck5zDINg/WFUfBaiqZ6vqpar6NvCnfKfr5SiwZWj1za1NkrROlnO1TIA7gYNV9d6h9o1Di/0CsL+N7wGuT3JukouBrcAja1eyJGkpy7la5vXArwCfTbKvtf0GcEOSbUABh4FfA6iqA0nuBZ5gcKXNzV4pI0nra8lwr6pPAllk1gOnWedW4NYR6pIkjcA7VCWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6tGS4J9mS5KEkTyQ5kOQdrf3CJA8m+Xx7vaC1J8n7khxK8niSy8b9RUiS/rPlnLm/CLyrqi4BrgBuTnIJsBPYW1Vbgb1tGuAaYGsbdgB3rHnVkqTTWjLcq+pYVT3Wxr8OHAQ2AduB3W2x3cB1bXw78IEa+BRwfpKNa124JOnUVtTnnmQWuBR4GNhQVcfarGeADW18E/D00GpHWtvJ77UjyXyS+YWFhZXWLUk6jWWHe5JXAR8B3llVXxueV1UF1Eo2XFW7qmququZmZmZWsqokaQnLCvck5zAI9g9W1Udb87Mnulva6/HWfhTYMrT65tYmSVony7laJsCdwMGqeu/QrD3AjW38RuC+ofZfbVfNXAE8P9R9I0laB2cvY5nXA78CfDbJvtb2G8BtwL1JbgK+CLy1zXsAuBY4BHwTeNtaFixJWtqS4V5VnwRyitlXLbJ8ATePWJckaQTeoSpJHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nq0HLuUNWUmd358UmXIGnKeeYuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4tGe5J7kpyPMn+obbfSXI0yb42XDs075Ykh5I8meTN4ypcknRqyzlzfz9w9SLtt1fVtjY8AJDkEuB64EfbOn+S5Ky1KlaStDxLhntV/QPwlWW+33bgnqp6oaqeAg4Bl49QnyRpFUbpc397ksdbt80FrW0T8PTQMkda28sk2ZFkPsn8wsLCCGVIkk622nC/A/hBYBtwDPiDlb5BVe2qqrmqmpuZmVllGZKkxawq3Kvq2ap6qaq+Dfwp3+l6OQpsGVp0c2uTJK2jVYV7ko1Dk78AnLiSZg9wfZJzk1wMbAUeGa1ESdJKLfk/VJPcDbwRuCjJEeC3gTcm2QYUcBj4NYCqOpDkXuAJ4EXg5qp6aSyVS5JOaclwr6obFmm+8zTL3wrcOkpRkqTReIeqJHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtSh5Z8KqQ0DWZ3fnxi2z5821smtm1ptTxzl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ0uGe5K7khxPsn+o7cIkDyb5fHu9oLUnyfuSHEryeJLLxlm8JGlxyzlzfz9w9UltO4G9VbUV2NumAa4BtrZhB3DH2pQpSVqJJcO9qv4B+MpJzduB3W18N3DdUPsHauBTwPlJNq5RrZKkZVptn/uGqjrWxp8BNrTxTcDTQ8sdaW0vk2RHkvkk8wsLC6ssQ5K0mJE/UK2qAmoV6+2qqrmqmpuZmRm1DEnSkNWG+7Mnulva6/HWfhTYMrTc5tYmSVpHqw33PcCNbfxG4L6h9l9tV81cATw/1H0jSVonSz7yN8ndwBuBi5IcAX4buA24N8lNwBeBt7bFHwCuBQ4B3wTeNoaaJUlLWDLcq+qGU8y6apFlC7h51KIkSaPxDlVJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdWjJB4dJ/9XN7vz4RLZ7+La3TGS76oNn7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOjXQTU5LDwNeBl4AXq2ouyYXAh4BZ4DDw1qr66mhlntqkbjABbzKRNL3W4sz9p6tqW1XNtemdwN6q2grsbdOSpHU0jm6Z7cDuNr4buG4M25Akncao4V7A3yV5NMmO1rahqo618WeADYutmGRHkvkk8wsLCyOWIUkaNuqDw95QVUeT/DfgwSSfG55ZVZWkFluxqnYBuwDm5uYWXUaStDojhXtVHW2vx5N8DLgceDbJxqo6lmQjcHwN6pxKk/wwV5JOZ9XhnuQ84Luq6utt/GeB3wX2ADcCt7XX+9aiUOm/Gh81rFGMcua+AfhYkhPv8/+q6m+S/BNwb5KbgC8Cbx29TEnSSqw63KvqC8BrF2n/MnDVKEVJkkbjHaqS1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktShUZ8KKakz/nezPnjmLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQNzFJmhr+U/C145m7JHXIcJekDo0t3JNcneTJJIeS7BzXdiRJLzeWcE9yFvDHwDXAJcANSS4Zx7YkSS83rg9ULwcOVdUXAJLcA2wHnhjT9iRp1Xp8Eua4wn0T8PTQ9BHgJ4cXSLID2NEmv5HkyVVs5yLgS6uqcLysa+WmtTbrWplprQumtLa8Z6S6fuBUMyZ2KWRV7QJ2jfIeSearam6NSloz1rVy01qbda3MtNYF01vbuOoa1weqR4EtQ9ObW5skaR2MK9z/Cdia5OIk3w1cD+wZ07YkSScZS7dMVb2Y5O3A3wJnAXdV1YExbGqkbp0xsq6Vm9barGtlprUumN7axlJXqmoc7ytJmiDvUJWkDhnuktShMzLcp+nRBknuSnI8yf6htguTPJjk8+31ggnUtSXJQ0meSHIgyTumobYkr0jySJLPtLre3dovTvJw26cfah/Er7skZyX5dJL7p6yuw0k+m2RfkvnWNg3H2flJPpzkc0kOJnndpOtK8iPt+3Ri+FqSd066rlbb/2rH/f4kd7efh7EcY2dcuE/how3eD1x9UttOYG9VbQX2tun19iLwrqq6BLgCuLl9nyZd2wvAlVX1WmAbcHWSK4D3ALdX1Q8BXwVuWue6TngHcHBoelrqAvjpqto2dE30pPclwB8Bf1NVrwFey+B7N9G6qurJ9n3aBvwP4JvAxyZdV5JNwP8E5qrqxxhcbHI94zrGquqMGoDXAX87NH0LcMuEa5oF9g9NPwlsbOMbgSen4Pt2H/CmaaoN+B7gMQZ3L38JOHuxfbyO9Wxm8EN/JXA/kGmoq237MHDRSW0T3ZfAq4GnaBdmTEtdJ9Xys8A/TkNdfOfO/QsZXKl4P/DmcR1jZ9yZO4s/2mDThGo5lQ1VdayNPwNsmGQxSWaBS4GHmYLaWtfHPuA48CDwz8BzVfViW2RS+/QPgf8DfLtNf9+U1AVQwN8lebQ9ugMmvy8vBhaAP29dWX+W5LwpqGvY9cDdbXyidVXVUeD3gX8BjgHPA48ypmPsTAz3M0oNfh1P7HrTJK8CPgK8s6q+NjxvUrVV1Us1+JN5M4OHzL1mvWs4WZKfA45X1aOTruUU3lBVlzHojrw5yU8Nz5zQvjwbuAy4o6ouBf6Vk7o6Jnn8t77rnwf+8uR5k6ir9fFvZ/BL8fuB83h5l+6aORPD/Ux4tMGzSTYCtNfjkygiyTkMgv2DVfXRaaoNoKqeAx5i8Kfo+UlO3FQ3iX36euDnkxwG7mHQNfNHU1AX8B9nfVTVcQb9x5cz+X15BDhSVQ+36Q8zCPtJ13XCNcBjVfVsm550XT8DPFVVC1X1LeCjDI67sRxjZ2K4nwmPNtgD3NjGb2TQ372ukgS4EzhYVe+dltqSzCQ5v42/ksHnAAcZhPwvTqquqrqlqjZX1SyDY+oTVfXLk64LIMl5Sb73xDiDfuT9THhfVtUzwNNJfqQ1XcXgsd4TP/6bG/hOlwxMvq5/Aa5I8j3t5/PE92s8x9ikPugY8YOJa4H/z6Cv9jcnXMvdDPrPvsXgTOYmBn21e4HPA38PXDiBut7A4M/Ox4F9bbh20rUBPw58utW1H/it1v7fgUeAQwz+jD53gvv0jcD901JXq+EzbThw4pif9L5sNWwD5tv+/Cvggimp6zzgy8Crh9qmoa53A59rx/5fAOeO6xjz8QOS1KEzsVtGkrQEw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR16N8BQZBZmXkrZTkAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(train.Age)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**数据形状类似于正太分布，其实只要数据大到一定层度。按统计学规律。是一定会呈现出正太分布的形式的**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.321835Z",
     "start_time": "2020-08-05T13:41:04.077268Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([732., 106.,  31.,   2.,  11.,   6.,   0.,   0.,   0.,   3.]),\n",
       " array([  0.     ,  51.23292, 102.46584, 153.69876, 204.93168, 256.1646 ,\n",
       "        307.39752, 358.63044, 409.86336, 461.09628, 512.3292 ]),\n",
       " <BarContainer object of 10 artists>)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAARKElEQVR4nO3df6zddX3H8edrVNChsfy4Nk3brBgbDX+Mym5YjWZRiAbQWP5QIjGjIU26P9iC0UTLlmwx2R/4jyjJQtaIsyxORJTQIFG7gln2B+hFkF+VcSWQtin0yqBOiW7oe3+cT/VQ295ze38c7qfPR3JyPt/353Pu9/Ohh1e//dzvuTdVhSSpL3807glIkhae4S5JHTLcJalDhrskdchwl6QOrRj3BADOPffcWr9+/binIUnLyoMPPvizqpo4Vt9rItzXr1/P1NTUuKchSctKkmeP1+e2jCR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdeg18QnV+Vi//dtjO/czN3xwbOeWpBPxyl2SOmS4S1KHDHdJ6pDhLkkdMtwlqUOzhnuStyd5eOjx8ySfSHJ2kt1JnmrPZ7XxSXJTkukkjyS5cPGXIUkaNmu4V9WTVbWxqjYCfwa8DNwJbAf2VNUGYE87BrgM2NAe24CbF2HekqQTmOu2zCXAT6vqWWAzsLPVdwJXtPZm4NYauB9YmWT1QkxWkjSauYb7x4CvtfaqqjrY2s8Bq1p7DbBv6DX7W+1VkmxLMpVkamZmZo7TkCSdyMjhnuR04MPAN47uq6oCai4nrqodVTVZVZMTE8f8/a6SpJM0lyv3y4AfVdXz7fj5I9st7flQqx8A1g29bm2rSZKWyFzC/Sp+vyUDsAvY0tpbgLuG6le3u2Y2AYeHtm8kSUtgpB8cluRM4P3AXw2VbwBuT7IVeBa4stXvAS4HphncWXPNgs1WkjSSkcK9qn4JnHNU7QUGd88cPbaAaxdkdpKkk+InVCWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOjRTuSVYmuSPJT5LsTfKuJGcn2Z3kqfZ8VhubJDclmU7ySJILF3cJkqSjjXrl/kXgO1X1DuACYC+wHdhTVRuAPe0Y4DJgQ3tsA25e0BlLkmY1a7gneTPwF8AtAFX1v1X1ErAZ2NmG7QSuaO3NwK01cD+wMsnqBZ63JOkERrlyPw+YAf4lyUNJvpTkTGBVVR1sY54DVrX2GmDf0Ov3t9qrJNmWZCrJ1MzMzMmvQJL0B0YJ9xXAhcDNVfVO4Jf8fgsGgKoqoOZy4qraUVWTVTU5MTExl5dKkmYxSrjvB/ZX1QPt+A4GYf/8ke2W9nyo9R8A1g29fm2rSZKWyKzhXlXPAfuSvL2VLgGeAHYBW1ptC3BXa+8Crm53zWwCDg9t30iSlsCKEcf9DfDVJKcDTwPXMPiL4fYkW4FngSvb2HuAy4Fp4OU2VpK0hEYK96p6GJg8RtclxxhbwLXzm5YkaT78hKokdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUoZHCPckzSR5N8nCSqVY7O8nuJE+157NaPUluSjKd5JEkFy7mAiRJf2guV+7vq6qNVXXkF2VvB/ZU1QZgTzsGuAzY0B7bgJsXarKSpNHMZ1tmM7CztXcCVwzVb62B+4GVSVbP4zySpDkaNdwL+F6SB5Nsa7VVVXWwtZ8DVrX2GmDf0Gv3t9qrJNmWZCrJ1MzMzElMXZJ0PCtGHPeeqjqQ5C3A7iQ/Ge6sqkpSczlxVe0AdgBMTk7O6bWSpBMb6cq9qg6050PAncBFwPNHtlva86E2/ACwbujla1tNkrREZg33JGcmedORNvAB4DFgF7ClDdsC3NXau4Cr210zm4DDQ9s3kqQlMMq2zCrgziRHxv9bVX0nyQ+B25NsBZ4Frmzj7wEuB6aBl4FrFnzWkqQTmjXcq+pp4IJj1F8ALjlGvYBrF2R2kqST4idUJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA6NHO5JTkvyUJK72/F5SR5IMp3k60lOb/Uz2vF061+/SHOXJB3HXK7crwP2Dh1/Drixqt4GvAhsbfWtwIutfmMbJ0laQiOFe5K1wAeBL7XjABcDd7QhO4ErWntzO6b1X9LGS5KWyKhX7l8APg38th2fA7xUVa+04/3AmtZeA+wDaP2H23hJ0hKZNdyTfAg4VFUPLuSJk2xLMpVkamZmZiG/tCSd8ka5cn838OEkzwC3MdiO+SKwMsmKNmYtcKC1DwDrAFr/m4EXjv6iVbWjqiaranJiYmJei5Akvdqs4V5V11fV2qpaD3wMuLeqPg7cB3ykDdsC3NXau9oxrf/eqqoFnbUk6YTmc5/7Z4BPJplmsKd+S6vfApzT6p8Ets9vipKkuVox+5Dfq6rvA99v7aeBi44x5lfARxdgbpKkk+QnVCWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1KFZwz3J65P8IMmPkzye5LOtfl6SB5JMJ/l6ktNb/Yx2PN361y/yGiRJRxnlyv3XwMVVdQGwEbg0ySbgc8CNVfU24EVgaxu/FXix1W9s4yRJS2jWcK+BX7TD17VHARcDd7T6TuCK1t7cjmn9lyTJQk1YkjS7kfbck5yW5GHgELAb+CnwUlW90obsB9a09hpgH0DrPwycc4yvuS3JVJKpmZmZeS1CkvRqI4V7Vf2mqjYCa4GLgHfM98RVtaOqJqtqcmJiYr5fTpI0ZE53y1TVS8B9wLuAlUlWtK61wIHWPgCsA2j9bwZeWIjJSpJGM8rdMhNJVrb2G4D3A3sZhPxH2rAtwF2tvasd0/rvrapawDlLkmaxYvYhrAZ2JjmNwV8Gt1fV3UmeAG5L8o/AQ8AtbfwtwL8mmQb+G/jYIsxbknQCs4Z7VT0CvPMY9acZ7L8fXf8V8NEFmZ0k6aT4CVVJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ7OGe5J1Se5L8kSSx5Nc1+pnJ9md5Kn2fFarJ8lNSaaTPJLkwsVehCTp1Ua5cn8F+FRVnQ9sAq5Ncj6wHdhTVRuAPe0Y4DJgQ3tsA25e8FlLkk5o1nCvqoNV9aPW/h9gL7AG2AzsbMN2Ale09mbg1hq4H1iZZPVCT1ySdHxz2nNPsh54J/AAsKqqDrau54BVrb0G2Df0sv2tdvTX2pZkKsnUzMzMXOctSTqBkcM9yRuBbwKfqKqfD/dVVQE1lxNX1Y6qmqyqyYmJibm8VJI0i5HCPcnrGAT7V6vqW638/JHtlvZ8qNUPAOuGXr621SRJS2SUu2UC3ALsrarPD3XtAra09hbgrqH61e2umU3A4aHtG0nSElgxwph3A38JPJrk4Vb7W+AG4PYkW4FngStb3z3A5cA08DJwzUJOWJI0u1nDvar+E8hxui85xvgCrp3nvCRJ8+AnVCWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOzRruSb6c5FCSx4ZqZyfZneSp9nxWqyfJTUmmkzyS5MLFnLwk6dhGuXL/CnDpUbXtwJ6q2gDsaccAlwEb2mMbcPPCTFOSNBcrZhtQVf+RZP1R5c3Ae1t7J/B94DOtfmtVFXB/kpVJVlfVwQWb8WvI+u3fHst5n7nhg2M5r6Tl42T33FcNBfZzwKrWXgPsGxq3v9UkSUto3t9QbVfpNdfXJdmWZCrJ1MzMzHynIUkacrLh/nyS1QDt+VCrHwDWDY1b22p/oKp2VNVkVU1OTEyc5DQkScdysuG+C9jS2luAu4bqV7e7ZjYBh3vdb5ek17JZv6Ga5GsMvnl6bpL9wD8ANwC3J9kKPAtc2YbfA1wOTAMvA9cswpwlSbMY5W6Zq47TdckxxhZw7XwnJUmaHz+hKkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6NOuP/NVrz7h+MTf4y7ml5cIrd0nqkOEuSR0y3CWpQ4a7JHXIcJekDi3K3TJJLgW+CJwGfKmqbliM8+jUMa47hLw7SMvVgod7ktOAfwLeD+wHfphkV1U9sdDnkhabt51quVqMK/eLgOmqehogyW3AZsBw78A4w05aLD3+Jb4Y4b4G2Dd0vB/486MHJdkGbGuHv0jy5Eme71zgZyf52uXoVFrvqbRWOGq9+dwYZ7L4TqU/2xOudZ5/zn9yvI6xfUK1qnYAO+b7dZJMVdXkAkxpWTiV1nsqrRVOrfW61sW3GHfLHADWDR2vbTVJ0hJZjHD/IbAhyXlJTgc+BuxahPNIko5jwbdlquqVJH8NfJfBrZBfrqrHF/o8Q+a9tbPMnErrPZXWCqfWel3rIktVjeO8kqRF5CdUJalDhrskdWhZh3uSS5M8mWQ6yfZxz2e+knw5yaEkjw3Vzk6yO8lT7fmsVk+Sm9raH0ly4fhmPndJ1iW5L8kTSR5Pcl2r97re1yf5QZIft/V+ttXPS/JAW9fX200IJDmjHU+3/vVjXcBJSHJakoeS3N2Oe17rM0keTfJwkqlWG+t7edmG+9CPObgMOB+4Ksn5453VvH0FuPSo2nZgT1VtAPa0Yxise0N7bANuXqI5LpRXgE9V1fnAJuDa9ufX63p/DVxcVRcAG4FLk2wCPgfcWFVvA14EtrbxW4EXW/3GNm65uQ7YO3Tc81oB3ldVG4fuaR/ve7mqluUDeBfw3aHj64Hrxz2vBVjXeuCxoeMngdWtvRp4srX/GbjqWOOW4wO4i8HPI+p+vcAfAz9i8MntnwErWv1372kGd5u9q7VXtHEZ99znsMa1DALtYuBuIL2utc37GeDco2pjfS8v2yt3jv1jDtaMaS6LaVVVHWzt54BVrd3N+ts/w98JPEDH623bFA8Dh4DdwE+Bl6rqlTZkeE2/W2/rPwycs6QTnp8vAJ8GftuOz6HftQIU8L0kD7YfrQJjfi/7C7KXkaqqJF3du5rkjcA3gU9U1c+T/K6vt/VW1W+AjUlWAncC7xjvjBZHkg8Bh6rqwSTvHfN0lsp7qupAkrcAu5P8ZLhzHO/l5Xzlfqr8mIPnk6wGaM+HWn3Zrz/J6xgE+1er6lut3O16j6iql4D7GGxNrExy5CJreE2/W2/rfzPwwtLO9KS9G/hwkmeA2xhszXyRPtcKQFUdaM+HGPzFfRFjfi8v53A/VX7MwS5gS2tvYbA3faR+dfvO+ybg8NA/AV/zMrhEvwXYW1WfH+rqdb0T7YqdJG9g8P2FvQxC/iNt2NHrPfLf4SPAvdU2aF/rqur6qlpbVesZ/H95b1V9nA7XCpDkzCRvOtIGPgA8xrjfy+P+RsQ8v4lxOfBfDPYu/27c81mA9XwNOAj8H4N9uK0M9h73AE8B/w6c3caGwd1CPwUeBSbHPf85rvU9DPYpHwEebo/LO17vnwIPtfU+Bvx9q78V+AEwDXwDOKPVX9+Op1v/W8e9hpNc93uBu3tea1vXj9vj8SNZNO73sj9+QJI6tJy3ZSRJx2G4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA79P/hRDmLVWiVVAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(train.Fare)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**对定类数据使用条形图来观察**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.485530Z",
     "start_time": "2020-08-05T13:41:04.323886Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD1CAYAAACrz7WZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAMCUlEQVR4nO3dX4il9X3H8fenbkxLU7L+mS52d+0Kbgn2IkYGa0kvWqWtf0rXi0QMpS6ysDcGElJotr0phV7oTW2FIiw1dC1tjKQNLkbSykYJpWgcG2tibOpUtLuLuhOjtkHS1uTbi/ktGcfZnZmdMzPu1/cLhvM8v+d3zvMbGN778Ow5M6kqJEm9/MRmL0CSNHnGXZIaMu6S1JBxl6SGjLskNWTcJamhLZu9AIALL7ywdu3atdnLkKSzypNPPvndqppa6ti7Iu67du1iZmZms5chSWeVJC+e6pi3ZSSpIeMuSQ0Zd0lqyLhLUkPGXZIaMu6S1JBxl6SGjLskNfSu+BDT2WLXgS9v9hJaeeH2GzZ7CVJbXrlLUkPGXZIaMu6S1JBxl6SGjLskNWTcJakh4y5JDRl3SWrIuEtSQ8Zdkhoy7pLUkHGXpIaMuyQ1ZNwlqaEVxT3JC0m+meSpJDNj7PwkDyd5bjyeN8aT5K4ks0meTnLFen4DkqR3Ws2V+69V1eVVNT32DwBHqmo3cGTsA1wH7B5f+4G7J7VYSdLKrOW2zB7g0Ng+BNy4YPzemvcYsDXJRWs4jyRplVYa9wL+McmTSfaPsW1V9dLYfhnYNra3A0cXPPfYGJMkbZCV/pm9X6mq40l+Fng4yb8tPFhVlaRWc+Lxj8R+gIsvvng1T5UkLWNFV+5VdXw8ngC+BFwJvHLydst4PDGmHwd2Lnj6jjG2+DUPVtV0VU1PTU2d+XcgSXqHZeOe5KeT/MzJbeA3gG8Bh4G9Y9pe4IGxfRi4Zbxr5irgjQW3byRJG2Alt2W2AV9KcnL+31bVV5I8AdyfZB/wInDTmP8QcD0wC7wJ3DrxVUuSTmvZuFfV88CHlxh/FbhmifECbpvI6iRJZ8RPqEpSQ8Zdkhoy7pLUkHGXpIaMuyQ1ZNwlqSHjLkkNGXdJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkPGXZIaMu6S1JBxl6SGjLskNWTcJakh4y5JDRl3SWrIuEtSQ8Zdkhoy7pLUkHGXpIaMuyQ1tOK4JzknyTeSPDj2L0nyeJLZJF9Icu4Yf//Ynx3Hd63T2iVJp7CaK/dPAc8u2L8DuLOqLgVeA/aN8X3Aa2P8zjFPkrSBVhT3JDuAG4C/HPsBrga+OKYcAm4c23vGPuP4NWO+JGmDrPTK/c+A3wd+NPYvAF6vqrfG/jFg+9jeDhwFGMffGPMlSRtk2bgn+S3gRFU9OckTJ9mfZCbJzNzc3CRfWpLe81Zy5f5R4LeTvADcx/ztmD8HtibZMubsAI6P7ePAToBx/IPAq4tftKoOVtV0VU1PTU2t6ZuQJL3dsnGvqj+oqh1VtQu4GfhqVf0O8AjwsTFtL/DA2D489hnHv1pVNdFVS5JOay3vc/8s8Jkks8zfU79njN8DXDDGPwMcWNsSJUmrtWX5KT9WVY8Cj47t54Erl5jzA+DjE1ibJOkM+QlVSWrIuEtSQ8Zdkhoy7pLUkHGXpIaMuyQ1ZNwlqSHjLkkNGXdJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ2t6o91SHp32nXgy5u9hFZeuP2GzV7CmnnlLkkNGXdJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkPLxj3JTyb5epJ/TfJMkj8e45ckeTzJbJIvJDl3jL9/7M+O47vW+XuQJC2ykiv3/wGurqoPA5cD1ya5CrgDuLOqLgVeA/aN+fuA18b4nWOeJGkDLRv3mvf9sfu+8VXA1cAXx/gh4MaxvWfsM45fkySTWrAkaXkruuee5JwkTwEngIeB/wBer6q3xpRjwPaxvR04CjCOvwFcMME1S5KWsaK4V9UPq+pyYAdwJfChtZ44yf4kM0lm5ubm1vpykqQFVvVumap6HXgE+GVga5KTf+xjB3B8bB8HdgKM4x8EXl3itQ5W1XRVTU9NTZ3Z6iVJS1rJu2Wmkmwd2z8F/DrwLPOR/9iYthd4YGwfHvuM41+tqprgmiVJy1jJn9m7CDiU5Bzm/zG4v6oeTPJt4L4kfwJ8A7hnzL8H+Osks8D3gJvXYd2SpNNYNu5V9TTwkSXGn2f+/vvi8R8AH5/I6iRJZ8RPqEpSQ8Zdkhoy7pLUkHGXpIaMuyQ1ZNwlqSHjLkkNGXdJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkPGXZIaMu6S1JBxl6SGjLskNWTcJakh4y5JDRl3SWrIuEtSQ8Zdkhoy7pLUkHGXpIaMuyQ1tGzck+xM8kiSbyd5Jsmnxvj5SR5O8tx4PG+MJ8ldSWaTPJ3kivX+JiRJb7eSK/e3gN+rqsuAq4DbklwGHACOVNVu4MjYB7gO2D2+9gN3T3zVkqTTWjbuVfVSVf3L2P5v4FlgO7AHODSmHQJuHNt7gHtr3mPA1iQXTXrhkqRTW9U99yS7gI8AjwPbquqlcehlYNvY3g4cXfC0Y2NMkrRBVhz3JB8A/g74dFX918JjVVVArebESfYnmUkyMzc3t5qnSpKWsaK4J3kf82H/m6r6+zH8ysnbLePxxBg/Duxc8PQdY+xtqupgVU1X1fTU1NSZrl+StISVvFsmwD3As1X1pwsOHQb2ju29wAMLxm8Z75q5Cnhjwe0bSdIG2LKCOR8Ffhf4ZpKnxtgfArcD9yfZB7wI3DSOPQRcD8wCbwK3TnLBkqTlLRv3qvonIKc4fM0S8wu4bY3rkiStgZ9QlaSGjLskNWTcJakh4y5JDRl3SWrIuEtSQ8Zdkhoy7pLUkHGXpIaMuyQ1ZNwlqSHjLkkNGXdJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkPGXZIaMu6S1JBxl6SGjLskNWTcJakh4y5JDRl3SWpo2bgn+VySE0m+tWDs/CQPJ3luPJ43xpPkriSzSZ5OcsV6Ll6StLSVXLn/FXDtorEDwJGq2g0cGfsA1wG7x9d+4O7JLFOStBrLxr2qvgZ8b9HwHuDQ2D4E3Lhg/N6a9xiwNclFE1qrJGmFzvSe+7aqemlsvwxsG9vbgaML5h0bY5KkDbTm/1CtqgJqtc9Lsj/JTJKZubm5tS5DkrTAmcb9lZO3W8bjiTF+HNi5YN6OMfYOVXWwqqaranpqauoMlyFJWsqZxv0wsHds7wUeWDB+y3jXzFXAGwtu30iSNsiW5SYk+Tzwq8CFSY4BfwTcDtyfZB/wInDTmP4QcD0wC7wJ3LoOa5YkLWPZuFfVJ05x6Jol5hZw21oXJUlaGz+hKkkNGXdJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkPGXZIaMu6S1JBxl6SGjLskNWTcJakh4y5JDRl3SWrIuEtSQ8Zdkhoy7pLUkHGXpIaMuyQ1ZNwlqSHjLkkNGXdJasi4S1JDxl2SGjLuktTQusQ9ybVJvpNkNsmB9TiHJOnUJh73JOcAfwFcB1wGfCLJZZM+jyTp1Nbjyv1KYLaqnq+q/wXuA/asw3kkSaewZR1ecztwdMH+MeCXFk9Ksh/YP3a/n+Q767CW96oLge9u9iKWkzs2ewXaBP5sTtbPn+rAesR9RarqIHBws87fWZKZqpre7HVIi/mzuXHW47bMcWDngv0dY0yStEHWI+5PALuTXJLkXOBm4PA6nEeSdAoTvy1TVW8l+STwD8A5wOeq6plJn0en5e0uvVv5s7lBUlWbvQZJ0oT5CVVJasi4S1JDxl2SGtq097lrMpJ8iPlPAG8fQ8eBw1X17OatStJm88r9LJbks8z/eocAXx9fAT7vL2zTu1mSWzd7Dd35bpmzWJJ/B36xqv5v0fi5wDNVtXtzViadXpL/rKqLN3sdnXlb5uz2I+DngBcXjV80jkmbJsnTpzoEbNvItbwXGfez26eBI0me48e/rO1i4FLgk5u1KGnYBvwm8Nqi8QD/vPHLeW8x7mexqvpKkl9g/tcsL/wP1Seq6oebtzIJgAeBD1TVU4sPJHl0w1fzHuM9d0lqyHfLSFJDxl2SGjLuktSQcZekhoy7JDX0/yEHl9Gwas5jAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "train.Survived.value_counts().plot(kind='bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.635007Z",
     "start_time": "2020-08-05T13:41:04.490162Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:ylabel='Survived'>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPUAAADnCAYAAADGrxD1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAT00lEQVR4nO3de5QcZZ3G8e87k8lEEAYFLyBgidyEyKIIgiwQ8IK77W0lXpaAIrgK6hFxjad0uZSwYAu6KKy6ATcqcUVFFIVaPBiVEBOQyGUhBMHEtCZyk6CdaCKQ5N0/qoeMYTLdM+muX9Vbz+ecOSGDzPuoeahLV/1e571HRMLRZx1ARLpLpRYJjEotEhiVWiQwKrVIYFRqkcCo1CKBUalFAqNSiwRGpRYJjEotEhiVWiQwKrVIYFRqkcCo1CKBUalFAqNSiwRGpRYJjEotEhiVWiQwKrVIYFTqHDnnXu+cu885t9Q5F1vnkTA5jQjOh3OuH7gfeC2wElgE/LP3folpMAmOjtT5OQRY6r3/jff+CeBbwJuNM0mAVOr8vABYMeL3K1vfE+kqlVokMCp1fn4P7Dbi97u2vifSVbpRlhPn3CSyG2WvJivzIuB47/09vVgvitNtgRcC0YhfI2AXYGA4VuuLLfy6Bniw9fVA69cVwDLggUa9pj88BaRS58g594/A54F+YLb3/vyt/ZlRnL4AmAa8jE3FfSGw09b+7DbWAr8hK/i9wAJgfqNea/Z4XWlDpS6ZKE53JSvx8NeLDeNsbiNwFzCv9XVTo15bZRupelTqgovidDfgaOAoshLvYRpofDywhFbBgXmNeu0h20jhU6kLqFXkGa2vqcZxuskDNwJfBa5u1GtrbeOESaUuiChOtwfeAZwAHMGmG1ahWg1cBXy1Ua8tsA4TEpXaWBSnBwOnAu8EtjGOY+XXwNeAKxr12krjLKWnUhuI4nQKcCLwfuAg4zhFshGYC1wGfE8fmU2MSp2jKE4nAScDZ6NHRNv5P+CsRr12rXWQslGpcxDFqSO7Xj4X2Ms4TtncDHyyUa/daB2kLFTqHovi9B+AC4ADjaOU3Vyyci+yDlJ0KnWPRHF6OPBpsjvZ0j3XkJ2WL7YOUlQqdZdFcfoS4ELgDdZZArYR+CYwUw+zPJ1K3SWt6+aPAucDg8ZxquIx4IxGvXaFdZAiUam7IIrT3YGvkz3GKflLgfc36jW9yorep95qUZy+i+wlhmnGUaqsBtzT+v+i8nSknqAoTncEZgHHWWeRv3ElcFqVXwFVqSeg9THVbOD51llkVA1gRqNeW2gdxIJKPQ5RnA4CFwOnWWeRtjYAZzfqtQusg+RNpe5QFKc7AD8AjjSOIuMzm+wm2nrrIHlRqTvQer/5emB/6ywyIT8C3tao1/5sHSQPKnUbUZxOJSv0rtZZZKvcBtQa9drD1kF6TR9pjSGK02nAz1GhQ3AQcHMUp/tYB+k1lXoLojh9O9lp25B1FumaFwELojh9lXWQXlKpRxHF6UfI9rrS457h2RH4SRSnb7UO0isq9WaiOL2I7GOr0GeEVdkU4KooTv/FOkgv6EbZCFGcngecaZ1DcrMROK5Rr11jHaSbVOqWKE4/DHzBOofkbh3w6ka9drN1kG5RqYEoTmcAc9Apd1WtAl7VqNfutw7SDZUvdRSnrwd+yKZN46SalpMVu/RDFypd6taDJQuB7ayzSCHcDhxV9ifPKnv3O4rT5wLXoULLJi8Hvtsa5VxalSx1a5j+NWRbvoqMdCzwFesQW6OSpQYuBw6zDiGF9e4oTj9hHWKiKndNHcXpCWR3ukXG8iRwaKNeu906yHhVqtStDdvvBnYwjiLlsAQ4qFGv/dU6yHhU5vS7NcJ3Niq0dG4/st1VSqUypSYbQfRa6xBSOh+J4vRo6xDjUYnT7yhO9yTbRbGq+z/L1vkdcEBZJpQGf6SO4rSfbNC+Ci0TtTtwqXWITgVfamAmEPRL8ZKLE6M4LcWM96BPv6M4PQBYBEy2ziJBWAVMLfrz4cEeqUfc7VahpVt2BC6yDtFOsKUm2w7nIOsQEpwZUZy+3DrEWIIsdevm2LnWOSRIjoIfrYMsNXAC8BLrEBKsY6I4rVmH2JLgbpRFcToA3Ec2DlakV5YAL23Uaxutg2wuxCP1e1Ghpff2A95pHWI0QR2pozh9BrAU2MU6i1TCr4D9i3a0Du1I/UFUaMnPvhTwaB3MkTqK0+3IhsftaJ1FKqVwR+uQjtSno0JL/vYF/sk6xEhBlDqK0z7gfdY5pLJOtg4wUhClBo4BdrMOIZV1bBSnhbmXE0qpT7IOIJXWD7zLOsSw0pc6itPtKdg1jVTSe6wDDCt9qYG3owEIYm/vomxmH0KpT7IOINJSiKN1qT+nbs0e+7V1DpGWNcDzG/XaWssQZT9Sn2QdQGSE7YDp1iFKW+rWZ9OFueMo0mJ+Cl7aUgNHos+mpXiOau2oaqbMpT7WOoDIKBzZAcdMmUv9GusAIltwlOXipSx1FKfPItsgXKSIVOoJOIbyZpfwTY3i1OyNwbIW4xjrACJjcMARVouXtdSHWwcQaWOa1cKlK3UUp88EplrnEGnD7Lp6zFI759Y451Zv6SuvkJs5mOxVN5EiOyCK0x0sFp401t/03m8H4Jw7D3gQmEN2vTAD2Lnn6UZ3mNG6IuPRR3Zdfa3Fwp14k/f+S977Nd771d77LwNv7mWwMbzSaF2R8TI5AHVa6r8452Y45/qdc33OuRnAX3oZbAx7G60rMl4mm0p0WurjyYYRPNz6elvrexb0vLeUxQstFh3zmnqY976B3en2U6I4fTawrXUOkQ6ZlLqjI7Vzbm/n3E+cc4tbvz/AOXdmb6ONaneDNUUmaucoTifnvWinp9+XA58AngTw3t+FzXYjKrWUicPgz2ynpd7Ge3/rZt9b3+0wHVCppWxyPwXvtNSPOudeDHgA59x0ss+t86ZSS9nkXuqObpSR7SZ5GbCvc+73ZBvRzehZqi1TqaVsClvq33rvX+Oc2xbo896v6WWoMajUUjaFPf1e7py7DDgU+HMP87SjUkvZFLbU+wJzyU7Dlzvn/tM59/e9i7VF2qpWymb7vBfsqNTe+7Xe++94798KvIws6LyeJhMJw0DeC3b8PrVz7ijn3JeA24ApZI+N5q2824lIVeVe6o5ulDnnGsAdwHeAmd57q5c5VGopm2KWGjjAe281FGEklVrKplilds593Ht/IXC+c+5phfLef7hnyUanUned91cM1G/6+77FL7VOEqKNuDXwx1zXbHekvrf16y97HaRDKnUXTWL9k9dPjm/dq+8B0znVIesj/2c62o0zGh7Fcrf3/vYc8rSjUnfJtqxbM2/wjF/v5FZrMmtvbch7wU7vfn/OOXevc+4855zlJE+Vuguex2OPLBr8wAM7udXa5aT3nsh7wU4/pz4aOBr4AzDLOXe30fvUKvVW2tutWP7zwdOf2MY9vo91lopYlfeCHX9O7b1/yHt/CXAqcCdwdq9CjSH3U5mQHN63ePGPJsdDA27DrtZZKqSYpXbOvcQ5lzjn7gYuBRYCFn8wHjJYMwjT++fd+o2BC/boc/7Z1lkq5tG8F+z0c+rZwLeAY733D/QwTzsNYH/D9UvpjElXzf9w//df5Zw2QTCQ+5G6bamdc/3Acu/9F3LI007DOkDZXDpwyY1v7L9lmnWOCivekdp7v8E5t5tzbrL3Pvc7eZtpGK9fGn1s3PC9yecsPLBv2TTrLBVXvCN1y3JggXPuh4wY4u+9/4+epBo7h7QxhcfXzR2cedeu7lGz7VTlKb/Ne8FOS72s9dUHbNe7OG01DNcuhR1Y88d5g2esHHJrtT1RMdyX94KdDvP/VK+DdKhhHaDIdncPr/zx5JlPDLr1eo67GNYCK/Ne1Hnf/nkO59zPGOXBD+/9Mb0INZYoTldje7ZQSAe6pfddPfmcHfqdf551FnnKnSTNl+W9aKen3x8b8ddTgOOwmfsN2TWKNp0f4XV9i+6YNXDxi53Lf3SOjCn3U2/o/PT7ts2+tcA5t/lw/7w0UKmfcnL/9QvPmjTnFc6R+/Yu0lZxS+2cG/kUUh/wCmCoJ4nauwd4g9HahXLepNnzTuife6RzOOssMqolFot2evp9G5uuqdeTHS1P6UWgDiwwWrdAvJ8zUL/piP679R50sS20WLTd5JODgRXe+xe1fv9usuvpBkb/FiL7H8pDNY9OGmxQGitImissFm73QscsWu+DOueOBD4NfB1okm3Dk7tGvbYKo2sVa9uybs0tgx+6e6++BzTYoPjMzijblbrfe/9Y66/fAVzmvb/ae38WsGdvo42pcqfgGmxQOsUttXNu+BT91cBPR/y9Tq/He+Emw7Vzp8EGpWRW6nbFvBKY55x7FFgHzAdwzu1JdgpuZa7h2rk6vG/x4jkDn95F70GXyhrgLqvF2z5R5pw7FNgZuGF4iL9zbm/gmZbDCKM4XUzg71ZP759360WTZk11jm2ss8i4XE3SnG61eCevXt4yyvfu702ccbmBgEutwQal9gPLxTueUVZAP7YO0CuXDlxy4+mTvn+ECl1K64HrLANY3uzaWjeS7ZX9TOMcXaPBBkGYT9LMd0uOzZT2SN2o19YB37bO0S1TeHzd/MHTbzuwb5kGG5TbNdYBSlvqltnWAbrhWax+7NbBDy59gVt1iHUW2Wqm19NQ8lI36rWFwK+sc2yN3d3DK28Z/NCftndrNdig/BaSNHMfX7S5Upe65b+tA0zUgW7pfT+b/NGBQbd+D+ss0hWXWweAMEp9BXYDGybsdX2L7vj+5LN31qSSYDSB71iHgABK3ajXHsH4I4TxOrn/+oWzBi7eX5NKgvI/JM211iEggFK3lOaG2XmTZs87a9KcwzSpJDiFOPWGcn9OPdL/Ag+SPc5aUBpsELBfkjTvtA4xLIgjdaNe20B2bV1IA6x/4seTZy5UoYNVhC2pnhJEqVu+hMEG3+1sy7o1Nw9+aLEGGwRrGdnbjIURTKkb9drvgP+yzjGSBhtUQp2kWah904Mpdcu/kz0Pbk6DDSphBdl4r0IJqtSNeu0PQN6b9j3N4X2LF/9ocjw04Dbsap1FeuozJM0nrUNsLqhSt3wOgz2Bh03vn3frNwYu2EOTSoL3EAV9mjG4UjfqtdVkU09zd8akq+ZfNGnWQZpUUgnnkDT/ah1iNMGVuuWLZNc7udFgg0q5HfiKdYgtCbLUjXrtcSDJY60+Nm64ZvJZ89/Yf8u0PNaTQjidpLnROsSWBFnqlq/T49cyNdigkq4kaf7cOsRYgi116ymzmb36+RpsUElrgY9bh2gn2FIDNOq164CvdvvnarBBZV1A0lxpHaKdoEvdcjqwvFs/TIMNKutO4ELrEJ0IvtSNem0NcCKw1Tc2NNigsp4E3l3EB01GE3ypARr12gLgM1vzMzTYoNLOJWmabaMzXpUodcs5wB0T+Qc12KDSFmD0MNNEtd1LKyRRnO4H3AZM6eyf0GCDilsN/B1Js2EdZDyqdKSmUa8tAeJO/rMabCDAKWUrNFSs1C2X0GYrXA02ELKPr75rHWIiKlfqRr3mgROAUYeua7CBAClwlnWIiarUNfVIUZzuT3YTZGj4e3u7FcvTyZ8c0HvQlXYf8EqSZtM6yERV7kg9rFGv3QNMp7URgAYbCNmNsbeUudBQ4VIDNOq1ucBpGmwgZEMr307SLPXebFDh0++R/DlDiXOcY51DzGwA3lnWG2ObU6mHJUNfBk61jiG582QfXXX9xR8rlT793swHgW9ah5DcfTSkQoNKvUk2yeJEevCqphTWp0ian7cO0W0q9UhZsU8h2+1DwnYmSTOxDtELuqbekmTos8C/WseQrtsInEbSvMw6SK+o1GNJhs6lxE8WydM8DswgaV5tHaSXVOp2kqFTgUsJZ9vfqlpD9mDJT62D9JpK3YlkaBrwXWBH4yQyMSvICn27dZA86EZZJ5LmjcAhwBLjJDJ+PwUOqkqhQaXuXNL8DXAocJ11FOnYhcDrSJp/sA6SJ51+j1cy1Af8G3A2us4uqj8D7wnlsc/xUqknKhl6JfANYE/rKPI37gSOJ2neax3Eik6/Jypp/gI4kAJvlFYx64FzgUOqXGjQkbo7kqG3AJcDOxknqaq7gJNImhOaFhsaHam7IWleA+xHtgm5/i2Zn/XA+cDBKvQmOlJ3WzJ0KNn+2Jpx1ls/IXvDqjRD9vOiUvdCdof8fWRHEU1T6a77gY+RNK+1DlJUKnUvJUM7AWcC76fjDQRkC/5IdiPsi2XZ08qKSp2HZGhnsk0E3ofKPV6ryV6F/SxJc5V1mDJQqfOkco/HI8DFwJfLPt0zbyq1hazcHwDeCzzfOE3RNICLgNkkzb8aZyklldpSMjQAvAU4DTjaNoypjcANZJ/1/5Ckud44T6mp1EWRDO1LNs30eOA5xmnysgyYA3yNpDnqNkgyfip10SRD/cARwHHAW4FdbAN13XKyN92+TdJcYB0mRCp1kSVDjux1z+OANwF72QaakA3AzWRFvpakqXfSe0ylLpNkaDeya++jgcMpZsmfIHtT6hayMt9A0nzMNFHFqNRllgw9BzgMeCnZs+f7AfsAz8gpwVqy6+J7gF+QFfkOkubjW/NDnXOzgTcAj3jvp251yopRqUOTPaL6IrKC7wE8F3he69fhv34OMEj2Qs9oL/WsA/5E9hTXn1pfj5Ht6b0MWAosJWk+2Iv/Cs65I8kGHVyhUo+fSi3DN+eGvzaQNJ8wToRzLgKuU6nHT+N4BJLmBrIbWhIAvU8tEhiVWiQwKrVIYFRqKRzn3JVkn3Hv45xb6Zw7xTpTmejut0hgdKQWCYxKLRIYlVokMCq1SGBUapHAqNQigVGpRQKjUosERqUWCYxKLRIYlVokMCq1SGBUapHAqNQigVGpRQKjUosERqUWCYxKLRIYlVokMCq1SGBUapHAqNQigfl/ORCxT2VBkFAAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "train.Survived.value_counts().plot(kind='pie')"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "train.Sex.value_counts().plot(kind='pie')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征增强"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "观察下Name和Ticket这两个特征。似乎都不能使用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.656060Z",
     "start_time": "2020-08-05T13:41:04.638130Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Embarked  \n",
       "0      0         A/5 21171   7.2500        S  \n",
       "1      0          PC 17599  71.2833        C  \n",
       "2      0  STON/O2. 3101282   7.9250        S  \n",
       "3      0            113803  53.1000        S  \n",
       "4      0            373450   8.0500        S  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.663631Z",
     "start_time": "2020-08-05T13:41:04.658283Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "891"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train['Name'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.676376Z",
     "start_time": "2020-08-05T13:41:04.664902Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "681"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train['Ticket'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.688678Z",
     "start_time": "2020-08-05T13:41:04.680440Z"
    }
   },
   "outputs": [],
   "source": [
    "train['New_name'] = train['Name'].str.extract(', (\\w+).')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.700189Z",
     "start_time": "2020-08-05T13:41:04.691023Z"
    }
   },
   "outputs": [],
   "source": [
    "train.drop('Ticket', axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.708793Z",
     "start_time": "2020-08-05T13:41:04.702641Z"
    }
   },
   "outputs": [],
   "source": [
    "new_data_column = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'New_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.718392Z",
     "start_time": "2020-08-05T13:41:04.712351Z"
    }
   },
   "outputs": [],
   "source": [
    "new_data = train[new_data_column]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.737565Z",
     "start_time": "2020-08-05T13:41:04.723010Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>New_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "      <td>Miss</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked New_name\n",
       "0         0       3    male  22.0      1      0   7.2500        S       Mr\n",
       "1         1       1  female  38.0      1      0  71.2833        C      Mrs\n",
       "2         1       3  female  26.0      0      0   7.9250        S     Miss\n",
       "3         1       1  female  35.0      1      0  53.1000        S      Mrs\n",
       "4         0       3    male  35.0      0      0   8.0500        S       Mr"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.748608Z",
     "start_time": "2020-08-05T13:41:04.739084Z"
    }
   },
   "outputs": [],
   "source": [
    "temp = new_data['New_name'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.759612Z",
     "start_time": "2020-08-05T13:41:04.750858Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "517"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp.loc['Mr']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.773722Z",
     "start_time": "2020-08-05T13:41:04.761580Z"
    }
   },
   "outputs": [],
   "source": [
    "new_data.loc[:,'New_name'] = new_data.loc[:,'New_name'].apply(\n",
    "    lambda x:'Misc' if temp[x] < 10 else x\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用10作为分界线，否则认为出现的频率很低，因此归一类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.792121Z",
     "start_time": "2020-08-05T13:41:04.776059Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Mr        517\n",
       "Miss      182\n",
       "Mrs       125\n",
       "Master     40\n",
       "Misc       27\n",
       "Name: New_name, dtype: int64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_data['New_name'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:04.812512Z",
     "start_time": "2020-08-05T13:41:04.794843Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>New_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "      <td>Miss</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked New_name\n",
       "0         0       3    male  22.0      1      0   7.2500        S       Mr\n",
       "1         1       1  female  38.0      1      0  71.2833        C      Mrs\n",
       "2         1       3  female  26.0      0      0   7.9250        S     Miss\n",
       "3         1       1  female  35.0      1      0  53.1000        S      Mrs\n",
       "4         0       3    male  35.0      0      0   8.0500        S       Mr"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 相关性分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.126348Z",
     "start_time": "2020-08-05T13:41:04.814767Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:>"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAgD0lEQVR4nO3df7xVdZ3v8dcbELRI/IEpISOWjAVIJFyNfqI5XeyHdMcfSZo4qeRMNU5zrfFeGyWtJm9lo6VOZI5INqKW46nhhqai5iB5UH5aKvkjQU3RpEFJ5ZzP/LG+BzZ773PO3uy99g94P32sx1n7u75rrc/aLPdnf7/ftddSRGBmZlZoQLMDMDOz1uPkYGZmJZwczMyshJODmZmVcHIwM7MSTg5mZlbCycHMrMVJukrSs5JW9bJcki6VtEbSCkmH1rpPJwczs9Z3NTCtj+VHA2PSNAu4otYdOjmYmbW4iLgLeKGPKtOBayJzL7CHpBG17HNQLSu3k9fWP9pWPwWfcsjMZodQtWlDRjU7hKoNCTU7hKq8+bX2ihdgwpANzQ6haoc89tOa3+hqPnMG7/OWT5N94+8xJyLmVLG7kcCTBa/XprKnq9jGNnaa5GBWrN0Sg+24UiKoJhnkzsnBzCwP3V2N3Ns6oLDpvn8q224eczAzy0PX5sqn2nUAp6Srlt4JbIiI7e5SArcczMxyEdFdt21J+jdgKjBc0lrgfGCXbD/xL8AC4EPAGuBl4K9q3aeTg5lZHrrrlxwiYkY/ywP4TN12iJODmVk+6thyaAYnBzOzPDR2QLrunBzMzPLgloOZmRWL+lyF1DRODmZmeajjgHQzODmYmeXB3UpmZlbCA9JmZlbCLQczMyvR5gPSFd1bSdK5klanJwwtk3R4rTuWdIykc2rdTtrWxnpsx8ysbrq7K59aUL8tB0lTgI8Ah0bEK5KGA4Mr2bikQRFRNn1GRAfZzaLMzHY4Ee095lBJy2EEsD4iXgGIiPUR8ZSkx1OiQNJkSYvS/GxJ8yTdA8yTdK+kcT0bk7Qo1T9V0nclDZP0hKQBafnrJT0paRdJb5H0c0lLJd0t6a2pzoGSFktaKekr9X1LzMzqILorn1pQJcnhFmCUpIclXS7p/RWsMxY4Kt0saj5wAkB6bN2IiOjsqRgRG4BlQM92PwIsjIjXyB5+8bmImAScDVye6lwCXBERh9DHk44kzZLUKanzymv+rYKwzczqZEfvVoqIjZImAe8FjgDmVzBW0BERm9L89WQJ5nyyJHFjmfrzgY8DdwAnApdLGgq8C7hB2vLEriHp77uBY9P8POCiXmLf8nSldntMqJm1uRZtEVSqoquVIus8WwQskrQSmAlsZmvLY9eiVV4qWHedpOclTSBLAGeW2UUH8DVJewGTgNuB1wMvRsTE3sKqJHYzs6boeq3ZEdSk324lSQdLGlNQNBF4Anic7IMctn6L78184IvAsIhYUbwwIjYC95F1F/0sIroi4o/AY5KOT3FI0tvTKveQtTAATurvGMzMGq7Nu5UqGXMYCsyV9KCkFWTjCbOBLwOXSOoE+huWv5Hsw/z6PurMB05Of3ucBJwmaTmwGpieys8CPpNaMSMrOAYzs8Zq8wFpZQ8Q2vG125jDlENmNjuEqk0bMqr/Si1kSKj/Si3mza+1X8wThmxodghVO+Sxn9b8Rv/pnmsr/szZ9d0ntdw/rH8hbWaWhxbtLqpURb+QNjOz6kTXaxVP/ZE0TdJDktaUu1pU0p9JukPSA+lOFh+qNX4nBzOzPNRpzEHSQOAy4GiyMd8ZksYWVfsScH1EvIP0c4Baw3e3kplZHurXrXQYsCYiHgWQdB3ZxTkPFtQJYPc0Pwx4qtadOjmYmeWhiquQJM0CZhUUzUk/4oXsiswnC5atBYpvfjobuEXS58h+I3ZUteEWc3IwM8tDFS2Hwrs5bKcZwNUR8a10s9R5ksZHbP91sk4OZmZ5qN/vF9YBhdeJ75/KCp0GTAOIiMWSdgWGA89u7049IG1mlofNmyuf+nYfMCbdjXow2YBz8eMOfgd8AEDS28huafRcLeG75WBmloc6tRwiYrOkzwILgYHAVRGxWtIFQGd6Ns7/Br4v6fNkg9OnRo2/cHZyMDPLQx1/BBcRC4AFRWXnFcw/SHa36rpxcjAzy0OL3jOpUk4OZmZ5aPPbZ+w0yaHdbmS3eOXcZodQtXZ7jwE+rwOaHUJVXhjY7Aiq94519zc7hKr1O0RcCbcczNpTuyUGazP9X4XU0pwczMzy0OaPQ3ByMDPLg8cczMyshJODmZmV8IC0mZmV6OpqdgQ1cXIwM8uDu5XMzKyEk4OZmZXwmIOZmRWLbv/OwczMirlbyczMSvhqJTMzK+GWg5mZlXByMDOzEr7xXilJXcDKtP1fAzMj4uVe6s4GNkbEN/OIxcysKdq85TAgp+1uioiJETEeeBU4M6f9mJm1pu6ofOqHpGmSHpK0RtI5vdQ5QdKDklZL+lGt4eeVHArdDRwEIOkUSSskLZc0r7iipDMk3ZeW/1jS61L58ZJWpfK7Utk4Sb+StCxtc0wDjsXMrDJdXZVPfZA0ELgMOBoYC8yQNLaozhjg/wDvjohxwN/VGn6uyUHSILIDWilpHPAl4MiIeDtwVplVfhIR/yMt/zVwWio/D/ifqfyYVHYmcElETAQmA2vL7H+WpE5Jnc+9/Ew9D83MrE/R3V3x1I/DgDUR8WhEvApcB0wvqnMGcFlE/AEgIp6tNf68ksNukpYBncDvgB8ARwI3RMR6gIh4ocx64yXdLWklcBIwLpXfA1wt6Qyg5ym6i4H/K+kfgAMiYlPxxiJiTkRMjojJ+7xuvzoenplZP6roVir8IpumWQVbGgk8WfB6bSor9OfAn0u6R9K9kqbVGn5eVyttSt/ot5BUyXpXAx+LiOWSTgWmAkTEmZIOBz4MLJU0KSJ+JGlJKlsg6dMRcXv9DsHMrAZV3FspIuYAc2rY2yBgDNln5v7AXZIOiYgXt3eDjRhz6HE7cLykvQEk7VWmzhuApyXtQtZyINV9S0QsiYjzgOeAUZLeDDwaEZcCNwMTcj8CM7NK1W9Aeh0wquD1/qms0FqgIyJei4jHgIfJksV2a1hyiIjVwFeBOyUtBy4uU+0fgSVk3Ui/KSj/hqSVklYB/wksB04AVqXuq/HANTmGb2ZWnc1dlU99uw8YI+lASYOBE4GOojr/TuppkTScrJvp0VrCz6VbKSKG9lI+F5hbVDa7YP4K4Ioy6/1lmc19PU1mZq2nTrfsjojNkj4LLCQbc70qIlZLugDojIiOtOyDkh4EuoAvRMTztezXv5A2M8tDHW/ZHRELgAVFZecVzAfw92mqCycHM7McVHCJaktzcjAzy4Mf9mNmZiWcHMzMrIQf9mNmZsX8DGkzMyvl5GBmZiV8tZKZmZVwy8HMzEo4OZiZWbHocrdSW5g2ZFT/lVrIlENmNjuEqi1eObf/Si3muEP/ttkhVGWPAUOaHULVrt17arNDaA63HMzaU7slBmsvvpTVzMxKOTmYmVmJ9h5ycHIwM8tDbG7v7ODkYGaWh/bODU4OZmZ58IC0mZmVcsvBzMyKtXvLYUCzAzAz2yF1VzH1Q9I0SQ9JWiPpnD7qHSspJE2uNXy3HMzMchCb67MdSQOBy4C/ANYC90nqiIgHi+q9ATgLWFKP/brlYGaWg+iufOrHYcCaiHg0Il4FrgOml6l3IXAR8Kd6xO/kYGaWh/p1K40Enix4vTaVbSHpUGBURPxHHSIH3K1kZpaLCloEW0iaBcwqKJoTEXMqXHcAcDFwahXh9cvJwcwsB9Ukh5QIeksG64DC20rvn8p6vAEYDyySBLAf0CHpmIjorCLkbTg5mJnlILpUr03dB4yRdCBZUjgR+MSW/URsAIb3vJa0CDi7lsQALTTmIOlj6RKstzY7FjOzWtVrQDoiNgOfBRYCvwauj4jVki6QdExe8bdSy2EG8Mv09/wmx2JmVpPorlvLgYhYACwoKjuvl7pT67HPlmg5SBoKvAc4jazJhKQBki6X9BtJt0paIOm4tGySpDslLZW0UNKIJoZvZlaijpeyNkVLJAeya3Z/HhEPA89LmgT8JTAaGAt8EpgCIGkX4DvAcRExCbgK+Gq5jUqaJalTUuf9/7Um/6MwM0siVPHUilqlW2kGcEmavy69HgTcEBHdwDOS7kjLDyYbmb81jcwPBJ4ut9HCKwC+NPoT7X2jEzNrK63aIqhU05ODpL2AI4FDJAXZh30AN/W2CrA6IqY0KEQzs6p11+9qpaZohW6l44B5EXFARIyOiFHAY8ALwLFp7GFfYGqq/xCwj6Qt3UySxjUjcDOz3kS3Kp5aUdNbDmRdSBcVlf0YeBvZz8QfJPvp+P3Ahoh4NQ1MXyppGNkx/DOwumERm5n1o1U/9CvV9OQQEUeUKbsUsquYImKjpL2BXwEr0/JlwPsaGaeZWTWizUc5m54c+vEzSXsAg4ELI+KZJsdjZlYRtxxyVK8fc5iZNVqrXqJaqZZODmZm7aqrza9WcnIwM8uBWw5mZlbCYw5mZlbCVyuZmVkJtxzMzKxEV3cr3IBi+zk5mJnlwN1KZmZWottXK5mZWTFfympmZiXcrdQmhrRZFv+8Dmh2CFU77tC/bXYIVbvx/kubHUJV/jjzr5odQtXu79yv2SE0RT27lSRNI3sg2kDgyoj4etHyvwdOBzYDzwGfiognatlnew+nm9Wg3RKDtZeu7gEVT32RNBC4DDia7LHJMySNLar2ADA5IiYANwL/r9b4nRzMzHIQVUz9OAxYExGPRsSrZI9Snr7NviLuiIiX08t7gf1rjX+n6VYyM2ukOnYrjSR74FmPtcDhfdQ/Dfj/te7UycHMLAfVXK0kaRYwq6BoTkTMqXafkk4GJgPvr3bdYk4OZmY56K6ibkoEvSWDdcCogtf7p7JtSDoKOBd4f0S8UsXuy/KYg5lZDgJVPPXjPmCMpAMlDQZOBDoKK0h6B/A94JiIeLYe8bvlYGaWg811GnOIiM2SPgssJLuU9aqIWC3pAqAzIjqAbwBDgRskAfwuIo6pZb9ODmZmOaigRVD5tiIWAAuKys4rmD+qbjtLnBzMzHJQzZhDK3JyMDPLQT1bDs3g5GBmlgO3HMzMrESXWw5mZlaszZ8S6uRgZpaH7jZvOTTkR3CSzpW0WtIKScskHS7pyp47C0ra2Mt675S0JK3za0mzGxGvmVmt6njjvabIveUgaQrwEeDQiHhF0nBgcEScXsHqc4ETImJ5um3twXnGamZWL+0+IN2IlsMIYH3PvT4iYn1EPCVpkaTJPZUkfTu1Lm6TtE8qfiPwdFqvKyIeTHVnS5onabGkRySd0YDjMDOrWLdU8dSKGpEcbgFGSXpY0uWSyt0t8PVkPwMfB9wJnJ/Kvw08JOkmSZ+WtGvBOhOAI4EpwHmS3lS8UUmzJHVK6uzcuKauB2Vm1peuKqZWlHtyiIiNwCSy29E+B8yXdGpRtW5gfpr/IfCetO4FZLefvQX4BPDzgnVujohNEbEeuIPsgRjF+54TEZMjYvLkoQfV76DMzPrRrcqnVtSQq5UiogtYBCyStBKY2d8qBev+FrhC0veB5yTtXVynl9dmZk3jq5X6IelgSWMKiiYCxQ++HgAcl+Y/AfwyrfthaUuH3BiyFtiL6fV0SbumZDGV7La2ZmYtwVcr9W8o8B1JewCbgTVkXUw3FtR5CThM0peAZ4GPp/JPAt+W9HJa96SI6Er5YgVZd9Jw4MKIeKoBx2JmVpFW7S6qVO7JISKWAu8qs2hqQZ2hvax7Yh+bXhERp9QWnZlZPtr9Ulb/QtrMLAddbjk0XkTMbnYMZmZ9ccvBzMxKODmYmVmJOj1CummcHMzMctDuLYeG3JXVzGxnU8/bZ0iaJukhSWsknVNm+RBJ89PyJZJG1xq/k4OZWQ7qdfuMdEfqy4CjgbHAjJ7HHRQ4DfhDRBxEdk+6i2qN38nBzCwH3VVM/TgMWBMRj0bEq8B1wPSiOtPJHnEA2Q+MP1Bwd4nt4uRgZpaDapJD4R2k0zSrYFMjgScLXq9NZZSrExGbgQ3A3tTAA9JmZjmo5p5JETEHmJNXLNvDycHMLAd1vLfSOmBUwev9U1m5OmslDQKGAc/XslN3K5mZ5aCOVyvdB4yRdKCkwcCJQEdRnQ62PgrhOOD2iKjphq87Tcvhza+11y9SXhjY7Aiqt8eAIc0OoSqnT/4CF49b3+wwqrL73H9tdghV233C2c0OoSm663Qz7ojYLOmzwEJgIHBVRKyWdAHZEzQ7gB8A8yStAV4gSyA12WmSg1mxdksM1l7q+SO4iFgALCgqO69g/k/A8XXcpZODmVkeWvUhPpVycjAzy0G73z7DycHMLAeb1d5tBycHM7MctHdqcHIwM8uFu5XMzKxEvS5lbRYnBzOzHLR3anByMDPLhbuVzMysRFebtx2cHMzMcuCWg5mZlQi3HMzMrJhbDmZmVqLdL2VtyPMcJHVJWiZplaQbJL2uxu2NlrSqXvGZmdVbVDG1okY97GdTREyMiPHAq8CZlayUnmhkZtZ2NhMVT62oGU+Cuxs4SNJHJS2R9ICkX0jaF0DSbEnzJN1D9vCKfSXdJGl5mt6VtjNQ0vclrZZ0i6TdmnAsZmZlRRX/taKGJofUEjgaWAn8EnhnRLwDuA74YkHVscBRETEDuBS4MyLeDhwKrE51xgCXRcQ44EXg2DL7myWpU1Ln7S8/ktNRmZmV6q5iakWN6rbZTdKyNH832SPtDgbmSxoBDAYeK6jfERGb0vyRwCkAEdEFbJC0J/BYRPRscykwuninETEHmANw7ZtObs30bGY7pFZtEVSqUclhU0RMLCyQ9B3g4ojokDQVmF2w+KUKtvlKwXwX4G4lM2sZrdoiqFQzxhx6DAPWpfmZfdS7DfhrAEkDJQ3LOzAzs1p1RVQ81ULSXpJulfRI+rtnmToTJS1OY7QrJH28v+02MznMBm6QtBTo60nvZwFHSFpJ1n00tgGxmZnVpJuoeKrROcBtETGG7Mv0OWXqvAycksZopwH/LGmPvjbakG6liBhapuxm4OYy5bOLXv8emF5ms+ML6nyz9ijNzOqngWMO04GpaX4usAj4h21iiXi4YP4pSc8C+5BdzFNWM1sOZmY7rGquViq8sjJNs6rY1b4R8XSafwbYt6/Kkg4juwjot33V84/MzMxyUE13UeGVleVI+gWwX5lF5xZtJyT1uuN0deg8YGZE9Dlm7uRgZpaDenYrRcRRvS2T9HtJIyLi6fTh/2wv9XYH/gM4NyLu7W+f7lYyM8tBo65WAjrYesXnTMqM5UoaDNwEXBMRN1ayUScHM7McNPBqpa8DfyHpEeCo9BpJkyVdmeqcALwPODXdBHWZpIl9bdTdSmZmOWjUj+Ai4nngA2XKO4HT0/wPgR9Ws10nBzOzHPj2GWZmVqLdH/bj5GBmloOofaC5qZwczMxy0OWWg5mZFXO3kpmZlXC3UpuYMGRDs0OoyjvW3d/sEKp27d5Tmx1CVX5x//7s1b252WFUZfcJZzc7hKodumLnvC+mWw5mbardEoO1F1/KamZmJepwW4ymcnIwM8uBu5XMzKyEk4OZmZXw1UpmZlbCLQczMyvhq5XMzKxEV99P4Wx5Tg5mZjnwmIOZmZXwmIOZmZXwmIOZmZXobvNupQHNDsDMbEcUVfxXC0l7SbpV0iPp75591N1d0lpJ3+1vu04OZmY56IruiqcanQPcFhFjgNvS695cCNxVyUadHMzMctAdUfFUo+nA3DQ/F/hYuUqSJgH7ArdUstGmJgdJXZKWFUyjmxmPmVm9VNOtJGmWpM6CaVYVu9o3Ip5O88+QJYBtSBoAfAuo+IEgzR6Q3hQRE6tZQZIARbT5L0zMbIdWTYsgIuYAc3pbLukXwH5lFp1btJ2QVG7HfwMsiIi12Udo/5qdHLYhaShwM7AnsAvwpYi4ObUoFgJLgEnAhySdAJwADAFuiojzmxO1mVmpel7KGhFH9bZM0u8ljYiIpyWNAJ4tU20K8F5JfwMMBQZL2hgRvY5PNHvMYbeCLqWbgD8B/ysiDgWOAL6lrWluDHB5RIwDDk6vDwMmApMkva9444VNtRv/64lGHI+ZGQBd0VXxVKMOYGaan0n2BXsbEXFSRPxZRIwm61q6pq/EAM1vOWzTrSRpF+Br6YO+GxjJ1v6zJyLi3jT/wTQ9kF4PJUsW24zCFzbVVh740fa+6NjM2koDb5/xdeB6SacBT5D1qCBpMnBmRJy+PRttdnIodhKwDzApIl6T9Diwa1r2UkE9Af8UEd9rcHxmZhVp1O0zIuJ54ANlyjuBksQQEVcDV/e33WZ3KxUbBjybEsMRwAG91FsIfCqNUSBppKQ3NipIM7P+RETFUytqtZbDtcBPJa0EOoHflKsUEbdIehuwOA1JbAROpvxAjJlZw7X77TOamhwiYmjR6/Vko+rljC+qewlwSU6hmZnVxDfeMzOzEn7Yj5mZlWjVsYRKOTmYmeXAYw5mZlbCLQczMyvhx4SamVkJtxzMzKyEr1YyM7MSHpA2M7MS7lYyM7MS/oW0mZmVcMvBrE29MGAQe3VvbnYYtoNq9zEHtXt2azZJs9JDhdqGY85fu8ULjtm21WrPc2hHs5odwHZwzPlrt3jBMVsBJwczMyvh5GBmZiWcHGrXjv2djjl/7RYvOGYr4AFpMzMr4ZaDmZmVcHIwM7MSO11ykHSupNWSVkhaJunwOmzzGEnn1Cm+jVXU7UrHsErSDZJe10fd2ZLOrkeMeZD0MUkh6a3NjqWccueNpCsljU3Ly/67SXqnpCVpnV9Lmt2geCs+Nyrc3mhJq+oVXwX764m/ZxrdqH1bZqf6hbSkKcBHgEMj4hVJw4HBFa47KCLK/pw2IjqAjvpFWrFNETERQNK1wJnAxU2Iox5mAL9Mf89vcizb6O28iYjTK1h9LnBCRCyXNBA4OM9YC2zXudHXed5gW+KvlCSRjaO2972yW8TO1nIYAayPiFcAImJ9RDwl6fH0PzySJktalOZnS5on6R5gnqR7JY3r2ZikRan+qZK+K2mYpCckDUjLXy/pSUm7SHqLpJ9LWirp7p5vyJIOlLRY0kpJX6nh2O4GDkrbPCV9w10uaV5xRUlnSLovLf9xz7dKScenb5rLJd2VysZJ+lX69rZC0pgaYixL0lDgPcBpwImpbICkyyX9RtKtkhZIOi4tmyTpzvReLpQ0ot4xFentvFkkaXLBcXw7tS5uk7RPKn4j8HRarysiHkx1e86txZIekXRGjvHfDRwk6aOpFfOApF9I2rcolp7zfF9JN6XzYLmkd6XtDJT0/XSMt0jaLceYtyFpaHpf70//r0xP5aMlPSTpGmAVMErSF9L5vULSlxsV4w4nInaaCRgKLAMeBi4H3p/KHweGp/nJwKI0PxtYCuyWXn8e+HKaHwE8lOZPBb6b5m8GjkjzHweuTPO3AWPS/OHA7Wm+AzglzX8G2FjF8WxMfwel/f41MC4dX8/x7FVwLGen+b0LtvEV4HNpfiUwMs3vkf5+BzgpzQ/ueS/q/O9yEvCDNP+fwCTgOGAB2ReY/YA/pLJdUp19Ct7jq5p03iwCJqf5KHifzis4H85Lsd8EfBrYteDfYzmwGzAceBJ4Ux1jLndu7MnWKxRPB77Vy3k+H/i7ND8QGAaMBjYDE1P59cDJOb7nXek9X5beu0HA7mnZcGANoBRXN/DOtOyDZJe3Kp07PwPel+f5saNOO1W3UkRslDQJeC9wBDBf/Y8VdETEpjR/PXALWbfHCcCNZerPJ/vAuoPsW/Dl6Zvxu4AbspYvAEPS33cDx6b5ecBFVRzSbpKWpfm7gR+QfQDdEBHrASLihTLrjU+tlD3IPvgWpvJ7gKslXQ/8JJUtBs6VtD/wk4h4pIr4KjUDuCTNX5deD0rH0Q08I+mOtPxgYDxwa3ovB5K+meelwvOmm+zfHuCHpPcvIi5I3TofBD5BdmxTU72b07m1KR3fYcC/1ynscufGwSn2EWSJ/rGC+oXn+ZHAKSn+LmCDpD2BxyKiZ5tLyT6Y87JNt5KkXYCvSXof2Xs9Etg3LX4iIu5N8x9M0wPp9VBgDHBXjrHukHaq5ABbTvZFwCJJK4GZZN+IerrYdi1a5aWCdddJel7SBLIEcGaZXXSQncR7kX0Dvh14PfBi9N6Hur0/Ninply1IPn25GvhYZP3gp5I+rCLiTGUD9B8GlkqaFBE/krQklS2Q9OmIuH074y2R3qcjgUMkBdmHfZB9Wyy7CrA6IqbUK4ZK9HLe9LlKwbq/Ba6Q9H3gOUl7F9fp5XUtyp0b3wEujogOSVPJWgw9XqJ/rxTMd5G1ehrlJGAfYFJEvCbpcbb+v1oYu4B/iojvNTC2HdJONeYg6eCiPvOJwBNk3UqTUtmx9G0+8EVgWESsKF4YERuB+8i+Cf8ssn7mPwKPSTo+xSFJb0+r3EPqZyf7H6BWtwPH93wApQ/fYm8Ank7fxrbsU9JbImJJRJwHPEfWf/tm4NGIuJSse2JCHWIsdBwwLyIOiIjRETGK7BvtC8CxaexhX7Z+234I2EfZIDHKxnPGldtwvfRx3hQaQHYskLUQfpnW/bC2ZuwxZB+qL6bX0yXtmv6tppKdN3kaBqxL830lt9vIuqGQNFDSsJzjqsQw4NmUGI4ADuil3kLgU6m1jqSRkt7YqCB3JDtVciBrYs6V9KCkFcBYsm9PXwYukdRJ9j9vX24k+zC/vo8684GT2drNANmH8GmSlgOrgemp/CzgM+nb6MjqDqdURKwGvgrcmfZV7gqVfwSWkCWm3xSUfyMN9q0i69dfTtZ9tip1UYwHrqk1xiIzKG0l/JhsnGEt8CBZN839wIaIeJXsQ/iidHzLyLrs8tTbeVPoJeCw9N4dCVyQyj8JPJTev3lk4xI959gKsu7He4ELI+KpXI8ii/kGSUuB9X3UOws4Ip2TS8mOt9muBSanmE5h2/N2i4i4BfgRsDjVvZHsy5BVybfPsJYlaWjq798b+BXw7oh4ptlx1YOy3ztsjIhvNjsWs3J2ujEHays/k7QH2eDphTtKYjBrB245mJlZiZ1tzMHMzCrg5GBmZiWcHMzMrISTg5mZlXByMDOzEv8NrlHy9PZ2ya0AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.heatmap(new_data.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.132831Z",
     "start_time": "2020-08-05T13:41:05.128464Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',\n",
       "       'Embarked', 'New_name'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.148358Z",
     "start_time": "2020-08-05T13:41:05.134215Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Survived    1.000000\n",
       "Pclass     -0.338481\n",
       "Age        -0.064910\n",
       "SibSp      -0.035322\n",
       "Parch       0.081629\n",
       "Fare        0.257307\n",
       "Name: Survived, dtype: float64"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_data.corr()['Survived']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "为什么少了这么多变量？不是还有性别和姓名这些属性吗。怎么没显示出来\n",
    "\n",
    "这是因为这些数据都是字符串类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.167304Z",
     "start_time": "2020-08-05T13:41:05.151058Z"
    }
   },
   "outputs": [],
   "source": [
    "# 将定类数据进行one hot编码。也就是哑变量\n",
    "# 因为我们不希望这些分类数据有强度之分，也就是不喜欢，喜欢这样的差别\n",
    "train1 = pd.get_dummies(new_data, columns=['Sex', 'Embarked', 'New_name'])\n",
    "del new_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.722577Z",
     "start_time": "2020-08-05T13:41:05.169811Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:>"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcAAAAFQCAYAAAAoQ64wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAABPpUlEQVR4nO2dd7xdRbXHv780CAQITTqEEpAiBIgUsdBRHwJKkaKAj+oD8elDiihEEMGCiGABpYsURR+RhxSB0AkEhCT0AEE6BAgQApHcu94fMyd35+SUve/sW86565vP/mTv2TNrzzn33rPOzKz5LZkZjuM4jjPQGNTXHXAcx3GcvsAdoOM4jjMgcQfoOI7jDEjcATqO4zgDEneAjuM4zoDEHaDjOI4zIHEH6DiO4/QKki6Q9JqkqXXuS9IvJU2TNFnSJpl7B0h6Kh4HlNEfd4CO4zhOb3ER8NkG9z8HjI7HocBvACQtBZwEbA5sBpwkacnUzrgDdBzHcXoFM7sdeLNBlV2BSyxwLzBS0grATsBNZvammb0F3ERjR5qLIakGnN7lwxnPJEv3nL7p95Pab/XBh6ldYCF1JttYZuR76TZGz05qP3z7dZP7cNDPX0m28bMV30lq//Lziyf3YXrHosk2PrZEo8/GfMyePSyp/RrbzEruw6ETFku28bvt03+/Fz//JqXaKPKZM2zZNQ8jjNwqnGdm5xV43ErA85nrF2JZvfIk3AE6juM4pRCdXRGH16e4A3Qcx3Hq05E+41OAF4FVMtcrx7IXga2ryiekPszXAB3HcZz6dHbmP9IZD+wfo0G3AN42s5eBG4AdJS0Zg192jGVJDKgRoKQTgH2BDqATOMzMJiba3AVYz8xOL6F/s8xsRKodx3GcsjArxbEBIOlywkhuGUkvECI7h4bn2G+B64DPA9OA2cDX4r03JZ0C3B9NnWxmyQvGA8YBStoS2BnYxMzmSFoGyLVaLmmImc2tdc/MxhO+tTiO47Qf5YzsADCzfZrcN+CIOvcuAC4orTMMrCnQFYAZZjYHwMxmmNlLkqZHZ4iksZImxPNxki6VdBdwqaR7Ja1fMSZpQqx/oKRzJC0h6TlJg+L9RSU9L2mopDUlXS/pAUl3SPporLO6pHskTZH0w15+PxzHcZpjnfmPFmMgOcAbgVUkPSnp15I+k6PNesD28VvLlcBeAHFfygpmNqlS0czeBh4CKnZ3Bm4wsw8JUVHfMLNNgaOBX8c6ZwG/MbOPAS/X64SkQyVNkjTp95dcnv8VO47jpNLZkf9oMQbMFKiZzZK0KfApYBvgSknHNWk23szej+dXEZzoSQRH+Oca9a8EvgzcCuwN/FrSCOATwJ+keVtyFor/bwXsHs8vBX5cp+/zQovL2AfoOI6Tm46aqz9twYBxgABm1kEInZ0gaQpwADCXrpHwwlVN3su0fVHSG5I2JDi5w2s8YjzwoyjbsylwC7AoMNPMxtTrVvdejeM4Ts9TZhBMf2PATIFKWkfS6EzRGOA5YDrBWUHXaKweVwLHAEuY2eTqm2Y2ixCldBZwrZl1mNk7wLOS9oz9kKSNYpO7CCNFgP0KvyjHcZyepne3QfQqA8YBAiOAiyU9KmkyYX1vHPAD4CxJkwjbIxrxZ4LDuqpBnSuBr8T/K+wHHCTpYeARgt4dwDeBI+JoNFnWx3Ecp3TaOAhmwEyBmtkDhLW4au4A1q5Rf1yNslepes/M7CKCwnnl+s+Aquo8Sw3h1li+Zaboe/VfgeM4Th/QgsEteVHYduG0Cqestl/yD+y4B05Jap8qpg2w1/C+Fz0GOF1pWsH7zlkkuQ8bLf96so3r31gu2cbuH32+eaUGvPFcuhi2BqV/Ht3z7tJJ7dcnXYR6uRXSxMkB7ntx+WQbe7x8WbIY9pxHbs79Q1lo/e2Sn9ebDJgRoOM49Ul1fk4b04JTm3lxB+g4juPUpwWDW/LiDtBxHMepS9g91p64A3Qcx3Hq41OgAxNJHcAUwvv0GHCAmdVMIS5pHDDLzH7Wez10HMfpYdp4CnQg7QPsDu+b2Rgz2wD4N7XVXxzHcdqXjg/zHy2GO8D83AGsBSBpf0mTJT0s6dLqipIOkXR/vH+1pEVi+Z6Spsby22PZ+pLuk/RQtDm62p7jOE6f0cYb4d0B5kDSEOBzwJSYEul7wLZmthFBzaWav5jZx+P9x4CDYvmJwE6xfJdYdjhwVtQKHQu8UOP587JBTJo1rcyX5jiO0xiXQhuwDJf0EDAJ+BdwPrAt8CczmwEhU3GNdhvEvH9TCDJolTyCdwEXSToEGBzL7gG+K+lYYLVM9ol5mNl5ZjbWzMaOHbFWiS/PcRynCW08AvQgmMa8X53FQfmUQy4CdjOzhyUdCGwNYGaHS9oc+A/gAUmbmtkfJU2MZddJOszMbinvJTiO4yTQgiO7vPgIsDi3AHtKWhogpj6qZjHgZUlDyWR5kLSmmU00sxOB1wkJetcAnjGzXwLXABv2+CtwHMfJS4lToJI+K+kJSdNq5WOVdGaMh3goJi+fmbnXkbk3voyX5iPAgpjZI5JOBW6L2yT+CRxYVe37wESCk5tIcIgAP41BLgJuBh4GjgW+KulD4BXgRz3+IhzHcXJiJUV3ShoM/ArYgRDrcL+k8Wb26LxnmX0rU/8bwMYZEwvMyKXiDrABZjaiTvnFwMVVZeMy578BflOj3ZdqmDs9Ho7jOP2P8tb2NgOmmdkzAJKuIKSGe7RO/X2Ak8p6eC3cAbYYW32Q/m0sNZtDajYJgIvGnJhsY0RneuaAU0fOSGr/7AcLJfdhzgfpf4ZbDErLPvDklGWT+zBscLpk1jVDhyfb+PqqLyW1/9sLKyb3YdeFZybb2HLUy8k2SqHAGqCkQ4FDM0Xnmdl58XwlIKu6/gKweR07qwGrE5acKiwc87bOBU43s//N3bE6uAN0HMdx6lNgBBid3XlNKzZnb+DPNr8Q6Wpm9mKMm7hF0hQzezrlIR4E4ziO49SnvCCYF4FVMtcrx7Ja7A1cni0wsxfj/88AE5h/fbBbuAN0HMdx6tMxN//RmPuB0ZJWlzSM4OQWiOaU9FFgScIe6UrZkpIWiufLAFtRf+0wN+4AS0bSbpIs/hAdx3Fam5JGgGY2FzgSuIGgkHVVjKo/WdIumap7A1eYWXaRf11gkqSHgVsJa4DJDtDXAMtnH+BOeiGCyXEcp8cpUeHFzK4DrqsqO7HqelyNdncDHyutIxEfAZaIpBHAJwnan3vHskGSfi3pcUk3SbpO0h7x3qaSbpP0gKQbJK3Qh913HMdZENcCdXKyK3C9mT0JvCFpU+BLwChgPeCrwJYAUSXmbGAPM9sUuAA4tZbRrBj2te8/0/OvwnEcp4JrgTo52Qc4K55fEa+HEMSzO4FXJN0a768DbADcFPVFBwM1N/5kQ4tvWW6v9M1vjuM4eWnBkV1e3AGWRNQE3Rb4mCQjODQD/lqvCfCImW3ZS110HMcpTvPozpbFp0DLYw/gUjNbzcxGmdkqwLPAm8DucS1wOWJmCOAJYFlJ86ZEY65Bx3Gc/kMbrwH6CLA89gF+XFV2NSF89wXCnpXngQeBt83s3zEY5peSliD8LH4BPNJrPXYcx2mGte+qizvAkjCzbWqU/RJCdKiZzYoplO4DpsT7DwGf7s1+Oo7jFKIFR3Z5cQfYO1wraSQwDDjFzF7p4/44juPkwx2gk4KZbV2WrYWU/su41/A3k9qXkcnhwIdOTrYx58f/k2xj5u1p2TXWHPxWch8ee23pZBuLkhaoMGxQ+u/ViEXmJNs4Zuv3km1M+VutHNX52XbJtAwhAH99bqVkG9ssnPZ3CiH9QjItuL0hL+4AHcdxnPp0pKe56q+4A3Qcx3Hq41OgjuM4zoCkjR2g7wMsgKQTJD0iabKkhyRtLun3ktaL92fVabeFpImxzWOSxvVqxx3HcbqLS6E5ccP6zsAmZjYn5qQaZmYH52h+MbCXmT0saTBBBs1xHKffY53tuw/QR4D5WQGYYWZzAMxshpm9JGmCpLGVSpLOjKPEmyUtG4s/QtT5NLOOSh4rSeMkXSrpHklPSTqkl1+T4zhOY8pLiNvvcAeYnxuBVSQ9GdMbfaZGnUWBSWa2PnAbXfkAzwSekPRXSYdJWjjTZkOChuiWwImSVqw2ms0Gcc1szwbhOE4v0mn5jxbDHWBOzGwWsClwKPA6cKWkA6uqdQJXxvM/EHIDYmYnA2MJTnRf4PpMm2vM7H0zm0HIdLxZjWefZ2ZjzWzsrousUd6LchzHaYZrgToQpi+BCcAESVOAA5o1ybR9GviNpN8Br0dZtPnq1Ll2HMfpO1rQseXFR4A5kbSOpNGZojHAc1XVBhGyQkAY6d0Z2/6HYtI/YDTQAcyM17tKWjg6xK2B+0vvvOM4Tncxy3+0GO4A8zMCuFjSo5ImEzK8j6uq8x6wmaSphHW9it7XVwlrgA8BlwL7xdEkwGTC1Oe9BJ3Ql3r0VTiO4xShxClQSZ+V9ISkaZKOq3H/QEmvxy1jD0k6OHPvgBgs+JSkZrNvufAp0JyY2QPAJ2rc2jpTZ0Sdtns3MD3ZzPZP653jOE4PUZIUWtwC9itgB0KKuPslja9ExWe40syOrGq7FCGocCxhmeiB2DZJjNcdYIuxzMh0seDZs4cltR9RQrRXGULWCx17RrKNYQ9+Lan98GXSPxyenzk42cbGHWlC1O90DE3uw8hONa/UhLkz0gW151jaxNbgwem/328NSrexzKj0v/VSKC+6czNgmpk9AyDpCmBXQq7UZuwE3GRmb8a2NwGfBS5P6ZA7wD7EzMb1dR8cx3EaYQWCYCQdSoiUr3CemZ0Xz1ciJAWv8AKweQ0zu0v6NPAk8C0ze75O2+RkF+4AHcdxnPoUGAFGZ3de04r1+RtweVTbOoygorVtgr2GeBCM4ziOU5/ytEBfBFbJXK8cy7oeZfZGRW0L+D1h73Wutt3BHWABJHXEyKSpkv4kaZFEe6NixKjjOE7/ZG5H/qMx9wOjJa0uaRiwNzA+W0HSCpnLXYDH4vkNwI6SlpS0JLBjLEvCHWAx3jezMWa2AfBv4PA8jST5VLPjOK1JSVJoZjYXOJLguB4DrjKzRySdLGmXWO2oqKX8MHAUcGBs+yZwCsGJ3g+cXAmIScE/mLvPHcCGkr4AfA8YBrxB2OP3akx5tCawBvAvSf8N/DZeA3wdeAkYHNVhPkEY0u9qZu/35gtxHMepS4lpjszsOuC6qrITM+fHA8fXaXsBcEFpncFHgN0ijug+B0whqL1sYWYbA1cAx2Sqrgdsb2b7AL8EbjOzjYBNgEdindHAr6KA9kxg9xrPmyeGfeXM56tvO47j9BxtLIbtI8BiDI9qLhBGgOcTcvtdGeeuhwHPZuqPz4zmtgX2h3maom/Huexnzaxi8wFgVPVDs5FVT3z0c633W+Y4TstSZBtEq+EOsBjvm9mYbIGks4Gfm9l4SVszvzxanp2s2Z2/HcDwtC46juOUSAuO7PLiU6DpLEFXOG4jfbqbCet+SBosaYme7pjjOE4yHR35jxbDHWA644A/SXoAmNGg3jeBbWIapQcI64OO4zj9G18DdKC22LWZXQNcU6N8XNX1qwTdu2o2yNT5WXovHcdxysNa0LHlxR2g4ziOUx93gE5/YZnRs5Nt/NfkhZLanzqy0UxvPmbe/mGyjdRMDgBLXnlhUvsZux2U3IfFSlg6Gb35G0nt77pnxeQ+PDJ7ZLKNDyalfyQttfAHSe0HDU6Pehz1YXpmjHdeXjjZxrLJFmjrjPDuAB3HcZz6+AjQcRzHGYhYh48AHcdxnIGIjwCdRkjqIMiiVdjNzKb3UXccx3HKwx2g04QFFGKaIUmAzEpUmnUcxymZdt4G4RvhewBJIyTdLOlBSVMk7RrLR0l6QtIlwFRgFUnfkXS/pMmSftC3PXccx6mijTfCuwMsh+ExUe5Dkv4KfAB80cw2AbYBzogjPgjZH34dsz+sE683A8YAm0r6dLXxbDaIi597uTdej+M4DgA213IfrYZPgZbDfFOgkoYCP4rOrBNYCVgu3n7OzO6N5zvG45/xegTBId6eNZ7NBvHGFz7Ter9ljuO0Li04ssuLO8CeYT/CHtRNzexDSdOByq7WbIYIAaeZ2bm93D/HcZx8tHGUgk+B9gxLAK9F57cNsFqdejcA/ylpBICklSR9pLc66TiO0wzrtNxHq+EOsGe4DBgbMz/sDzxeq5KZ3Qj8Ebgn1v0zsFiv9dJxHKcZnQWOJkj6bAwEnCbpuBr3vy3p0RgUeLOk1TL3OjKxFuPLeGk+BVoC1VkizGwGsGWd6htU1T0LOKuHuuY4jpNEWSM7SYOBXwE7AC8A90sab2aPZqr9ExhrZrMlfR34CfDleK/wdrNmuANsMYZvv26yjX3vfyep/bMfpIlpA6w5+K1kG8OXSVeRThWzXuZ/z0/uwwErfirZxiOPbZjUfu2RM7HONAHnp95Jz/H89NwFMo4VZkWbk9R+nXGfSO7D0V+7JNnG08dvnWyjDGxuaaY2A6aZ2TMAkq4gpIib5wDN7NZM/XuBr5T29Br4FKjjOMnOz2ljCkyBZrdsxePQjKWVgOcz1y/EsnocBPw9c71wtHmvpN1SXxb4CNBxHMdpQBGtquyWrRQkfQUYC3wmU7yamb0oaQ3gFklTzOzplOf4CNBxHMepT3lBMC8Cq2SuV45l8yFpe+AEYBezrvlsM3sx/v8MMAHYuBuvZj7cATqO4zh1sc78RxPuB0ZLWl3SMGBvYL5oTkkbA+cSnN9rmfIlJS0Uz5cBtiKzdthdWtIBSjpB0iMxVPYhSZuXZHdPSY9JurV57W4/40BJ5/SUfcdxnDIpywGa2VzgSML+58eAq8zsEUknS9olVvspQRHrT1XbHdYFJkl6GLgVOL0qerRbtNwaoKQtgZ2BTcxsTvw2MKwk8wcBh5jZnSXZcxzHaWmso7wAKTO7DriuquzEzPn2ddrdDXystI5EWnEEuAIwozI3bGYzzOwlSZtKuk3SA5JukLSCpCXipst1ACRdLumQWkYlnQh8Ejhf0k8lDY7/VzI1HBbrbR2fc42kZySdLmk/SffFzA9rxnpfkDRR0j8l/UPScjWeuaykq+Mz7pe0VQ+9Z47jON2ixCnQfkcrOsAbCWmEnpT0a0mfieLTZwN7mNmmwAXAqWb2NmHIfZGkvYElzex3tYya2cnAJGA/M/sOYTT4tpl9HPg4cIik1WP1jYDDCcPyrwJrm9lmwO+Bb8Q6dwJbmNnGwBXAMTUeexZwZnzG7rH9AmRDiy+4O3nU7ziOkxvrVO6j1Wi5KVAzmyVpU+BThFRDVwI/JCis3BSzDg0GXo71b5K0J0GBYKMCj9oR2FDSHvF6CUKmhn8D95vZywCSniY4ZQhZ4beJ5ysDV0pagTBF+2yNZ2wPrNeVKYnFJY0ws1lVr3leaPHssw5vPcE9x3FallYc2eWl5RwggJl1EMJgJ0QNzSOAR8xsAfkxSYMII7XZwJKEzZd5EPANM7uhyt7WQFZqojNz3UnXe3o28HMzGx/bjKvxjEGEUeIHOfvkOI7Tq5i13sguLy03BSppHUmjM0VjCBFFy8YAGSQNlbR+vP+teH9f4MI4XZqHG4CvV+pLWlvSogW6ugRde1wOqFPnRrqmTJE0poB9x3GcHqdzrnIfrUYrjgBHAGdLGgnMBaYBhxKmCH8paQnC6/qFpLnAwcBmZvaupNuB7wEn5XjO74FRwIMKc5SvA7sV6Oc4QijvW8AtwOo16hwF/ErS5Njn2wlri47jOP0Ca+NFl5ZzgGb2AFBLrXYG8Oka5fPUo83s201sb5057wS+G48sE+JRq828e2Z2DXBNjWdcBFwUz2fQpXTuOI7T72jF4Ja8tJwDHOgc9PNXkm2ctvx7zSs1YM4H6b82j722dLKN52cOTraxWGJCiTIyObz/0h3JNiasf3xS+zLWQjZYfkayjaELp2f4ePPVIisVC7LjkTcl92H6nekZzvb/j18l27jyG83rNMMdYJshaSJQndPnq2Y2pS/64ziO01/xKdA2w8xKkU5zHMdpd3wE6DiO4wxIOkuUQutvuAN0HMdx6tLp+wBbk57KGpHYp1GSpvZ1PxzHcfJgptxHq9G2I8AezhrhOI4zIGjnNcB2HgH2SNaIeH9WzBTxSMz0sJmkCTE7xC6xzihJd0h6MB4L7F2sl3GiRr15YtjTZk0v471xHMfJhVn+o9VoZwfYI1kjIosCt5jZ+sC7BDHuHYAvAifHOq8BO5jZJoTN7r+sYadRxol5mNl5ZjbWzMauNWJU0ffBcRyn23g2iBakh7NG/Bu4Pp5PAeaY2YdRmHtULB8KnBP1PTuAtWvYqZdxolbmCMdxnF6no7N9x0lt6wChR7NGfGg2b8A/LxuEmXVKqryn3wJeJTjTQUCtjA81M044juP0F1pxajMvbevaezFrRD2WAF6OmqJfJYw2q0nNOOE4jtOjdJpyH82Q9NkYbzFN0nE17i8k6cp4f6KkUZl7x8fyJyTtVMZra+cRYG9ljajHr4GrJe1PmC6tJcCZmnHCcRynRylre4OkwYQlph0IM2z3SxpvZo9mqh0EvGVma8V4jB8DX5a0HrA3sD6wIvAPSWvHWb7u98naeXzbhry45bbJP7C/vbBiUvstBr2T2gVmflAtxVqc4YPThZNHb/5GUvvXH1skuQ//emvxZBtbP3Jaso2/bfC9pPYndE5L7sPvB62abGOpxWYn25jz77SxwRsfDE/uwwojZiXbWO/p/0v2Xg+usmvuz5xNnr+m7vPizNs4M9spXh8PYGanZercEOvcE5eTXgGWBY7L1s3WK/6KumjbKVDHcfKT6vzaiVTn1250dA7KfWS3bMXj0IyplYDnM9cvxDJq1TGzucDbwNI52xbGf9IN8KwRjuMMdIpIoZnZeYRlppbAHWADPGuE4zgDnRIXyV4EVslcrxzLatV5IU6BLgG8kbNtYXwK1HEcx6lLiVGg9wOjJa0uaRghqGV8VZ3xwAHxfA+C4IjF8r1jlOjqhP3S96W+Nh8BOo7jOHUpKwrUzOZKOpKw/WswcIGZPSLpZGCSmY0HzgculTQNeJPgJIn1rgIeJUT1H5EaAQot4AAldRDUVipcYWan52y7NXC0me2c8PwJ0cakbrS9CLjWzP5c5/5Q4BRgd4Kk2hzgZDP7e3f76ziOUyadJdoys+uA66rKTsycfwDsWaftqcCpJXan/ztA4H0zG9MXD477VnqSUwii3RvEjBXLAZ/p4Wc6juPkpqMF0xzlpWXXACVNl3RazPM3SdImMbvD05IOz1RdXNL/RfWA30bJMyT9JrZ7RNIPquz+WNKDZL6JSBok6SJJP6yXxUGBc+Kz/gF8pEH/FwEOIUihVaTUXjWzq2rUnRda/IdXX0p85xzHcfLTiXIfrUYrjACHS3ooc32amV0Zz/9lZmMknQlcBGwFLAxMBX4b62wGrAc8R1Bk+RLwZ+AEM3szjvJulrShmU2Obd6IWRyIznQIcBkw1cxOjXtb3jazj0taCLhL0o3AxsA68XnLEearL6jzutaK/W+6qzwbWlzGRnjHcZy8WAs6try0ggNsNAVaiSCaAowws3eBdyXNiRJoAPeZ2TMQ8vwBnyQ4wL2iIxtCmIZcD6g4wIqDrXAucFWcg4b6WRw+DVweF2dfknRLd16w4zhOf6HMNcD+RstOgUbmxP87M+eV64pzrx4xWQyjPRrYzsw2BP6PMHKsUK3beTewjaRKnUoWhzHxWN3MbizY92nAqpLSdbAcx3F6CEO5j1aj1R1gHjaL+04GERLT3gksTnByb8fAk881sXE+IXLpqrg5s14Wh9sJwq2DJa1AyENYEzObHe2eFffEIGlZhZyEjuM4/YK5BY5WoxWmQKvXAK83swXSaDTgfuAcwprbrcBfY96+fwKPE/Tl7mpmxMx+HjNIXArsR+0sDn8FtiWs/f0LaCbU+j1Ckt5HJX1AcMonNm7iOI7Te7TiyC4vng2ixZi08m7JP7DV1n0rqf2TU5ZN7QJS+u/de53p398+UNokyOjF307uw4tvL5Zs4+3EHTtfmPrD5D4cNvaYZBtHdnyYbCM1g/mKq6b/TB99Jv1vpAzHs+OrVyQb+dvy++T+Y/3CK5e3lLdshRGg4ziO00e04vaGvLgD7AUk/RVYvar4WDO7oS/64ziOk5d2niN0B9gLmNkX+7oPjuM43aGdt0G4A3Qcx3Hq0qH2nQLt19sgJHVEqbPKkTv6U9LWkq5NfP4ESWO72faizEb5WveHSfqFpGnxuFbSqt3vreM4Tvl0Fjhajf4+AmxnIewfAYsB65hZh6SvAddI2tTMWvF3yXGcNqSzfQeA/XsEWI82EcL+GvCtSk4rM7sQmAVsX6P+PDHsv7w3PeWtcxzHKUQ7i2H3dwc4vGoK9MuZe/+Ko8M7CELYewBbAD/I1NkM+AZB53NNghA2BCHsscCGwGckbZhp84aZbWJmV8TrihD2U2b2PeAgohA28HHgkCit9kW6hLD3Bz7R4HXVE8KeFNvPh5mdZ2ZjzWzslxYd1cCs4zhOuViBo9Vo5SlQF8J2HMfpYXwKtH/SykLYTxOEsKslQDYljAIdx3H6BR0FjlajlR1gHvqrEPZ7wMXAzyvBNpL2Bz4ghy6p4zhOb9Gp/EcKkpaSdJOkp+L/S9aoM0bSPTF+Y3J2WSzGaTybWTIb0+yZ/X0KtJ2FsI8Hfgo8IWl4tLOluTir4zj9iF4MST8OuNnMTo9b3o4Djq2qMxvY38yekrQi8ICkG8xsZrz/HTP7c94H9msHaGY1tyKY2ajM+UWEIJjqexMI63K12h/YzG683jpzflLm1nfjUc2RtezWedYc4CjgKEnLA38HvkrM/O44jtMf6EUHuCuwdTy/mPAZPp8DNLMnM+cvSXoNWBaY2Z0H9msHOFAws1eAjfPUnd6xaPLzRjz376T2wwanz/aPWGRO80pNGFnC6vwjs0cmtX/qnSWS+7DB8jOSbXz+5bQMBteWkMnh3Ek/SbbxyKb/nWxj1odpH2uTnl0+uQ8f+0j6z/TNtxZJtlEGVuDPLAYXHpopOs/M8n6pX87MXo7nrwDLNXnWZsAwQkxFhVMlnQjcDBwXBxp1cQfYw7gQtuM4rUyRRLfR2dV1eHGPdK1vGCdU2TE1yJkW4ywuBQ7ICIccT3Ccw2IfjgVObtRfd4A9jAthO47TypQZlGBmCwh9VJD0qqQVzOzl6OBeq1NvcUL0/glmdm/GdmX0OEfShYRo/4a0exSo4ziOk0BvRYES9nYfEM8PAK6priBpGCHg8JLqYJfoNInBibsBU5s90B2g4ziOU5deFMM+HdhB0lMEScjTASSNlfT7WGcvQnDjgTW2O1wmaQpBHGUZ4IfNHtgSU6CSOggvqsIVZnZ6zrZbA0eb2c4Jz58QbRTepC7pIuDaeqG5knYGTiF8GRkKnGVm53a3r47jOGXSW1GgZvYGsF2N8knAwfH8D8Af6rTftugzW8IB0qZZIeJm+vOAzczsBUkLEfYYOo7j9AvaeWNyS0+BtnpWCEI6pCHAGxD2BprZEzVe57xsEDfNnpbyljmO4xRirvIfrUarOMC2zAphZm8SFn6fk3S5pP0qzrmq3rxsEDssslaTt8pxHKc8PBtE39O2WSHM7GBJHyMs+h4N7AAc2KiN4zhOb9HZkq4tH60yAmxEK2eFCJ0xm2JmZxKc3+7dseE4jtMT9GIUaK/TDg4wD/0yK4SkETFKtcIY4LluvD7HcZwewadA+552zQoh4BhJ5wLvExzygQVel+M4To/SiiO7vMiz77QWT3z0c33+A7v0gwXSdBXmmB3SxYLnzkgX1J4+aWRS+6fnjkjuw5arvty8UhOee26ppPbDh36Y3IdBg9J/Ndd/4BfJNu7bIE3Ye6YNTe7Dc8PSd0+t9e8iKpy12fHVK5JjM783at/cP9gfTv9jS8WCtsoI0HEcx+kD+vwbdw/iDrCX8KwQjuO0Iu08BeoOsJfwrBCO47Qi7bwNwh2g4ziOU5f2dX85tkFIMklnZK6PljSuR3vVh8TX+4fM9RBJr0u6thu2Rkr6r3J76DiO03vMxXIfrUaefYBzgC9JWqanO9NPeA/YQNLweL0D8GI3bY0ECjnAqCU6UPZnOo7Tz2nnfYB5PmjnEjIWfKv6hqRlJV0dBaHvl7RVLJ8SRz+S9Iak/WP5JZJ2qPUQSQdK+ouk6yU9JeknmXuNRKubimFL+k5GtPoH1c+uwXXAf8TzfYDLM7Y2k3SPpH9KulvSOrF8fUn3xb5MljSakM9qzVj203p9kTQqimdfQkjiuEqOPjqO4/Q4rgQDvwL2i5vAs5wFnBkFoXcHKkkL7wK2AtYHngE+Fcu3JEiK1WMMQanlYwQ1lYojaCRa3VAMW9KOBI3OzaL9TSV9usnrvQLYO8qebQhMzNx7HPiUmW0MnAj8KJYfTsjlNwYYC7wAHAc8HaXSvtOkL6OBX5vZ+mY2nxpMNhvElTOfb9J1x3Gc8rAC/1qNXEEwZvZOHJ0cRVAsqbA9sF4QQgFC2qERBGf0aYKs12+AQyWtBLxlZtUam1luNrO3ASQ9CqxGUGlpJFrdTAx7x3j8M9YbQXA2tzd4vZMljSKM/q6rur0EcHEc4RkhiS0ExZcTJK0M/MXMnsq8LxXq9eVfwHNmdm+d/pxHGIX3i43wjuMMHFpxZJeXIlGgvwAeBC7MlA0CtjCzD7IVJd0OHAGsCpxASBG0B8ExNiIr7dEBDMmIVn/czN5SyLC+cI029cSwBZzWjSzr44GfAVsDS2fKTwFuNbMvRic5AcDM/ihpImHq9DqF/IDPVNms2Zdop9EXA8dxnD6hnbdB5A62iLnrriLkwatwIyHPHgCSxsS6zwPLAKNjGqI7CU6s7qirAUVFq6u5AfjPODJF0kqSGiWprXAB8AMzm1JVvgRdQTEHVgolrQE8Y2a/BK4hTJ2+S0h6m9oXx3GcPqEDy32kIGkpSTfFGJCbJNXUXJTUoa7csOMz5atLmihpmqQrJQ1r9syi0YZnEBxbhaOAsTGg41HCOliFicCT8fwOYCWCIyyEmT1MmDJ8HPgjOUSrq9rfGNvdI2kKIQ/gYo1bgZm9EJ1ZNT8BTlMQ0s6OoPcCpiqIdm8AXGJmbwB3SZoq6afd7YvjOE5f0YtBMMcRlsFGAzfH61q8n0lDt0um/MeEmJS1gLeYf7BWExfDbjH6wxqgi2F34WLYXbgYdhftJIZ98Kg9cv9gfz/9z91+nqQngK3N7GWFVHITzGydGvVmmdmIqrJKRp7lzWyupC2BcWa2U6NnuhJMizF7dtNRfVMe7kgbdH591ZeS+zDlb2kf2ABzLH275FILf9C8UgNWtHQn/OariybbWGqx2UntZ81eKLkPsz5M/zhJdV4Am039SfNKDbh1/e8m92GREoZDg/vJ2luRlxKDFQ/NFJ0Xg/jysJyZVb4NvgIsV6fewpImEbbonW5m/0uI05hpZpVvDS8QZh0b0usOUNJOhKFqlmd7UytT0tKEIXY128VpS8dxHAcKbW/IRqzXQtI/gOVr3Dqhyo5Jqvfg1czsxRh3cUtcTno7dycz9LoDjNkP+jQDQnRyY/qyD47jOK1AmdsgzGz7evckvSpphcwU6Gt1bLwY/39G0gRgY+BqYKSkIXEUuDI5FLxccstxHMepS4dZ7iOR8cAB8fwAQjT9fEhaUtJC8XwZguDKoxaCWW4lbLer274ad4CO4zhOXTqx3EcipwM7SHqKILJyOoCksZIqKmPrApMkPUxweKeb2aPx3rHAtyVNI6wJnt/sgaVPgcZ525+b2f/E66MJCi3jyn5WfyC+3svM7CvxegjwMjDRzHaWtAuwnpmd3pf9dBzH6Q69JXEWl6a2q1E+CTg4nt9NkMqs1f4ZgsxkbnpiBOjZIzJzz2Y23p2f4zitiothF8OzR8yfPeJASefE8z3jpviHo1wckgZL+lksnyzpG9XGlRHDvnrW9BzdcRzHKYdenALtdXpqDdCzR9TmRGAnM9sIqCgYHAqMAsaY2YbAZdWNzOw8MxtrZmN3HzGqSVccx3HKo7ek0PqCHtkG4dkj6nIXcJGkq4C/ZN6T31Y2cEbNVcdxnH5BO6uF9eQ+wF/g2SPmw8wOl7Q5Ybr0AUmbFnyG4zhOr9KKU5t56bFtEJ49YkEkrWlmE83sRIJu3SrATcBhMXoUSekaYY7jOCXhQTDdx7NHzM9PY8DPVMLa5sOEddB/AZPj3pZ9i/TXcRynJ2nnjPCeDaLFePuA7ZJ/YE/+Y/Gk9g8MShdv3nZEejaIwYPTf3cHDU773rr8uAW2LRVmxyNvSrZx9pD0n8myK85Kaj/p2VoSj8UYUsLn0dDED+JtHvlRch++tMlRyTZ27xiZbOPAF/+QnA3i86t+Pvcbet2/rkt+Xm/i2SAcx0l2fk77UoLEWb+l3ztAzx7hOI7Td7Ti1GZe+r0D9OwRjuM4fUc7R4H2ewfoOI7j9B3tHCfiDtBxHMepSzuPAEvfBiHJJJ2RuT5a0riyn9NfiK/3D5nrIZJel3RtvN5F0nF910PHcZzu02GduY9Ww7NBpOPZIBzHaVuswNFqeDaIvs8Gsb6k+2KfJksaXeO1zssGcdGTL1bfdhzH6TE8G0RxPBtEbWplgzgcOCv2aSzwQnWjbDaIA9deqUlXHMdxyqOdHaBng+j7bBD3ACdIWhn4i5k91aC94zhOr9LOUaA9qQX6C4IQdlajqZINYkw8VjKzWQTn8ql4TCAIRadmg9gu5tf7P7qXDaLSx7XM7Pwcr7eSDeLyehXM7HDgewQR7AckLW1mfySMBt8HrpO0bY5nOY7j9Aq9NQKUtJSkm+KS1k2SlqxRZ5u4XFQ5PpC0W7x3kaRnM/fGNHumZ4NYkF7NBiFpDeCZKKR9DWEK1XEcp1/QaZ25j0SOI8zqjSYoby0QPW9mt1YGJ8C2wGyCX6nwnczg5aFmD/RsEAu27+1sEHsBUyU9BGwAXFKkv47jOD1JL64B7gpcHM8vBnZrUn8P4O9mNru7D/RsEC3Gl1fbLfkH9tNl301qP3Th9P0+f30uPZjnrUHpv7ujPkwTrz969oPJfZh+51nJNu7c7vfNKzWgjB1ca37krWQb18/MM9nSmEUSX8zVg2cm9+EvDzb7LtycI8cem2zj3Ol/Ss7OsPHyW+X+Q3vo1bsPAw7NFJ1nZuflaStpppmNjOcixICMbFD/FuDnZlbZc30RIXByDnEEaWZz6rUHV4JxHMdxGlBkZBedXV2HJ+kfQK28WSdU2TFJdR8saQVC9H9WJ/p44BVgWOzDscDJjfrb7x2gZ4NwHMfpO8rMBmFm29e7J+lVSSuY2cvRwb3WwNRewF/N7MOM7Zfj6RxJFxLiSBrS7x2gZ4NwHMfpOzp7b5lsPHAAcHr8/5oGdfchjPjmkXGeIqwfTm32wH7vAB3HcZy+oxc1Pk8HrpJ0EGFP+F4AksYCh5vZwfF6FGEr2W1V7S+TtCxhK9tDzB9kWZM+c4BxfvfnZvY/8fpowsb0cX3Vp54kvt7LzOwr8XoI8DIw0cx27tPOOY7j1KG3EuLGmbbtapRPAg7OXE8n7BKorld4D3VPb4NohItmZ0Szs0Tn6DiO0+d0muU+Wo2+dIAumj2/aPY4SZdKugu4tOo1zBPDfnrW9ByPcRzHKQcr8K/V6EsHCC6aXS2avR6wvZntky3MimGvOWJUk0c4juOURzuPAPt0qs1FsxdgvJm9X6PccRynT+i0jr7uQo/RH9aafgE8CFyYKauIZn+QraiQQ+8IYFXCxskvki6a/XEzeyuqCHRHNPvcJs+upiKavTWwdNW9Rk7ccRyn12nFNEd56espUBfNdhzH6ceYWe6j1ehzBxhx0WzHcZx+iCfE7QHMbETm/FVgkcz1DELQSq12X82c300TJ25mFxGCWCrXO2fOD6zTZlSD9tl7ZxECdpqSfb2ZsgmE/Ie06/5Hx3Fam1Yc2eXFs0G0GO8ctEPyD+zG69IU97cc9XLzSk14Z8bCzSs1YZlR6Uum77yc1o8VDlszuQ9f+9lLyTZOGvJh80oNeGFW08mLpiw19IPmlZrw5ofpvxeDE0cizw8ZmtyHe4Y2TEKQi3MmVUsgF2foMmskZ4NYYeR6ud/Ql2c+mvy83qQ/BMGUgotmO47jlE8JiW77LW3jAF0023Ecp3xacW0vL23jAB3HcZzyaedlMneAjuM4Tl1aUeElLz2+DUKSSTojc320pHE9/dy+Ir7eP2Suh0h6XdK1fdkvx3Gc7uD7ANPwrA+e9cFxnBalnfcB9oYD9KwPDbI+SFpf0n2xD5Mlja7x2uZlg7jw8RdyPN5xHKccOjo7cx+tRm8pwXjWh/nJZn04HDgr9mEssICHy2aD+NpHV27yaMdxnPJo53RIvTIF51kfFiCb9eEe4ARJKwN/MbOnGrw+x3GcXqWdg2B6cw3qF3jWhwrznLiZ/VHSRMKU6XWSDjOzWwo+y3Ecp0doxeCWvPSaGLZnfaiNpDWAZ6JA9jWEKVPHcZx+QW9NgUraM8ZpdEoa26DeZyU9IWmapOMy5atLmhjLr5Q0rNkzezsbhGd9WJC9gKmSHgI2AC4p0j/HcZyepLOzM/eRyFTgSzQY6EgaTIgp+RxhKWsfSevF2z8mxJSsBbzF/IOtmvT4FKhnfWic9cHMTgdOz2PbcRynt+mtCVAzewwgExNSi82AaXFmEElXALtKegzYFtg31rsYGEeIIWn4UD/a7AAO7cv27WSjP/Shv9joD33oLzb6Qx/KslHmARwKTMochftHGCyMrXNvD+D3meuvAucQZhanZcpXAaY2e1Z/SYibG0k7xT1z2eOvvdyHpWv04aGYDaI/cGgft28nG/2hD/3FRn/oQ3+x0R/6UJaN0rDMlq14nJe9L+kfkqbWOHbti/62nBKJedYHx3GclsTMtk808SJhdFdh5Vj2BjBS0hAzm5spb0jLjQAdx3GcAcv9wOgY8TkM2Juwr9qAWwlTpAAHEKLqG+IOsD05r3mVHm3fTjb6Qx/6i43+0If+YqM/9KEsG/0CSV+U9AJB8ev/JN0Qy1eUdB1AHN0dSZgFfAy4ysweiSaOBb4taRph7/X5TZ8ZFwwdx3EcZ0DhI0DHcRxnQOIO0HEcxxmQuAN0HMdxBiTuAB2nB5G0SPNaTiNiXtBVmtfs/0gaLOmyvu6HE3AH6AAgaU1JC8XzrSUdFdNB5W2/nKTzJf09Xq8nqakWXx1by0vaRdIXJC3fjfYLSdpX0nclnVg5Ctr4pKSvxfNlY1aRIu0/EfVtH4/XG0n6dREbsd1qkraP58MlNdWhjXV3krRHjfI9VCepdANb28ffh6MkfaJg20MUkzxHR3ahpHei/u8meWzEEPdaacUKIemI7O+0pCUl/VdBG0l/J2bWAayWR6i5p/rgdOEOsIWR9G78MKl5FDR3NdAhaS1CaPUqBBHwvFxECE1eMV4/Cfx3wT4g6WDgPoIo7h7AvZL+s6CZa4BdgbmETCCVI28fTiKEVB8fi4YCfyjYhzOBnQgbdLEgyt4skXJ1Pw4hiK9XUnGtDPxvzuYnArfVKJ8AnJzz+atIehD4PjAqHqdJuj5+yTg4h5lvAtPj+T6EbCerA98mp75u5EFJHy9QvxaHmNnMyoWZvQUcUtBG6t8JhCTfd0n6vqRvV45e7oNDCyrBOF2Y2WIAkk4BXgYuJeQv3I+Q+LcInWY2V9IXgbPN7GxJ/2zaqotlzOwqScfHvs2V1FGwDwDfATaOajtEebm7Ceml8rKymX22G8+u8EVgY0L+Sszspbwjryxm9rzmF/Yt+n4cQRD/nRjtPaV8qbgAFjKz12v0aYakRXPa+BXwSwtC8fOQtD8hkbMBv29iY66ZfRjPdwYuiT/bf0j6Sc5+AGwO7CfpOcKXGREGh0XShw2WpDiirGQWKDoSS/07AXg6HoPIkVmmh/rg4A6wXdjFzDbKXP9G0sOEUUBePpS0D0FB4QuxbGiB9u9FZ1X5cNkCeLtA+wpvAO9mrt+NZUW4W9LHrEkuxgb828xMUuW15HUYWZ6P04UmaShhJPRYQRtzzOzfFScqaQj5xfkXV5cs1DxiX4bntPHRaucHYGaXSPoRkGcKs1PSCoT0NNsBp2bu5e0HhNF0KtcDV0qqjKgPi2VFSP07wcx+UPCZpffBCbgDbA/ek7QfcAXhA3IfCkz5Rb5GyMd4qpk9G9e8Li3Q/tvAeGBNSXcBy9IlS1SEacBESdcQXsuuwOTKFJGZ/bxeQ4V8jUb4vf6apGeAORQfLVwVPyRHxmnI/wR+V/B1HE6Y4luJoEl4I2FEV4TbJH0XGB7X7f4L+FvOtn8BfifpSDN7D0AhqfNZ8V4eaualkTQIeN/MXsth40RCVoDBBMmqR6KNzxCmAnNhZs9J+iQhSfaFkpYFFkg91oRjCeLRX4/XN9F8BFtNt/9OJI1vdN/MdunpPjjz40owbYCkUYQPtq0IDuAu4L/NbHo37S0JrGJmkwu2GwKsQ/jgfCIz9VXExkmN7jf69ixptSZtn8vxfBHW2j4K7Eh4LTeY2U3N2pZNdDQHZftBSAXT9I82/ix+CBwMVF73qgR5qO/n+dlIOpPgZP4740QXJaxvvm9m38z5OoYAi8U1t0rZooTPn1nxeodG73H8vRgLrGNma0taEfiTmW2Vpw817C1FmCov9DteZaPQ34mk14HngcsJ09rzfcEws1prttU2BhOmkfcr3mOnGneADgCSJgC7EEZPDwCvAXeZWa7FeUlfqlH8NjAl50ihls0lgZl5PvCr2m0BPGJm78brxYF1zWxizvZTzOxjhTsc2p5Ng2lKMzuqO3a7i6ThwFrxcpqZvV91v67jidOlpwEHMr8TvRj4rpn9u8R+PmhmdadUJT1EXJc1s41j2eQia4B1fsfvNrNvJdrI9XcSndcOdAUD/R9weUbLMm8f7gS2LfP9H6j4FGgbIGltQubj5cxsA0kbEtYFf1jAzBJm9k6M7LvEzE6SVOTb8UEEEdtb4/XWhA+I1SWdbGYNp2gUtilcZWaPK4R4/52QcmqupH3N7B8F+vIb5l+fmlWjrBEPSvq4md1f4JkVJnWjzXxkpnJrUuRDPzq8RmuhPyZMBdZq+yFwtKTv0+VEnzaz2VX9bTh6y0nDNOCUsy6b+jueZCNugbgeuD7+ju8DTJD0AzM7p0AfKlGk48ksdTRaHnBq4w6wPfgdIXryXAAzmyzpj4QpsLwMicEKewEndKMPQwijrFch7AsELiFE791O8zWKLwOnxPMDCBFyywJrE0YcRRzgvEg/ADPrjNNweel2xKGZXVzgOfXYuQQbeWnmeJKcaAGajfJrrcsWXb9L/R1PthEd338QnN8o4JdA0YTeqVGkTsQdYHuwiJndVxVyP7de5TqcTFhjutPM7pe0BvBUgfarVJxf5LVY9qakPGuB/844rZ0IU0MdwGMFnRfAM5KOIoz6IASP5A64oISIwxikcSywHrBwpdzMtm3WNs9aZYmUsQbS1ImmYmY/i4FA7xDWmU/sxqgz9Xc8yYakS4ANCJv6f2BmUws+GyglitSJ+BpgG6CgvnIkIShgEwUFkIPM7HO92IdfE9aH/hSLdgdeIIxMrzWzbZq0v5cQsPEq8ASwqZk9G+89bmYfLdCXjxC+WW9L+IC/mRDIUWgtMtrJOq9/FWh7I3AlcDQhYu8A4HUzO7aAjS2As4F1CfvVBgPvmdnieW3keEbDtbdetPEXM6u1jly5/+Pq965WWX9GUiddU5bZD97KDEPDn2uJUaROxEeA7cERBEWIj0p6EXiWsBk+N5IWJqzjrc/8H/p5VViOIKi3fDJeTyKsSb4HNHR+kW8SVE+WBc7MOL/PA7k3+cZAgzPNbO+8bWrY2AU4g6Bq8xqwGmEP3/oFzCxtZudL+maM7rtNUtE1xXMIGa//RIiA3J8wJVwm00u2Nx91gqPmYWZ/if83rEcIHql2dp+rUVarD8eY2U/qBSjlCUwqw4aZ5VLekrRkNmI2w5Y0iCJ1iuMOsD14zsy2j4EBgyrRjwW5lKBbuRNhmmc/CmzcjgEKzwBbAHsSnPDVBdpPJGw9qC6/jgI6kGbWoaCfOSwhSu4Uwuv4h5ltLGkb4CsFbVSmfV+W9B/AS8BSRTtiZtMkDY7TwRcqKH4c36xdiY4nD9Mb3Kts1P4I8Anglni9DUHhp+GeRElfJ0xhr1EVbLIYYbtPHiq/xykBSmXYyMvN1A7YWp6uKNJ96WYUqdOFT4G2AZL+RVS5AG4pum0g2vhn/LCfbGYbxhD4O8xsiybt1ib8Qe4DzIh9ONrMGu7Ja2BvaeAkwkjSgDuBky1Ko+W0cQlh2rBbUXKSJpnZWAU1nY1jEM3DNr/aTjMbOwN3EHQazwYWJ6z7NJzGqrJxO7A9IdjjFYLc3YF5+iHpwnha0/GYWdNAm7xONA9xSvgAM3s5Xq8AXGRmDddbJS0BLEnYjnFc5ta7ZvZm3ue3EpW/xSZ1KlGkPyX8XhWJInUiPgJsDz5KiBw8Ajhf0rXAFWZ2ZwEblRHLTEkbED5w8+hOPk74oN/ZzKYBSMq9r6oGVxCiRneP1/sRnOr2BWykRsnNVFBNuR24TNJrFFTWMbNr4+nb5JsCrsVXCet+RwLfIjjT3Ru26Hp+JZPFjcB61Y4n5/OTRm9VrFLpQ+RVwppxQ8zsbeBtSd8DXjGzOZK2BjaUdIllxK3rUcbaWS+vv9X9AltSFKkT8RFgm6GwefwsYD8zG1yg3cGEKcsNgQsJCiAnmtlvm7TbjbBOtRVhFHoFQa2kUPqgjL2pZrZBVVm3N6YXfPaqZvavOJX8PsGB7gcsAVxWcBS6OvANwofUvC+avR2oIOkxM1s3cz2IIBKwboNm1Ta6NXqrsnEOMJqwfgVh28s0M/tGzvYPEdZBRxGmxK8B1jezz+doW4YCS7KNvNQLKqqKIr2iu1GkThfuANsEBW3FLwOfJaxTXGlmudfgSnj+ogTdzn0I0ZeXAH81sxsL2vk5IR3SVbFoD2AzMzu6gI1lgWNYMKCn4RaE7AePpKvNLNdoq46thwmyY1OAzkwfcn9QxmnUUwhBOEPIGS1YZSPJ8UQbyU40tvsiXSmhbjez3COXys9G0jEEGbaz80wVxrbJCixl2CjwrJqvKzWK1FkQd4BtgKTphEjJqwiCw0Vy3zWUcMq7blZlc0lCIMyXzWy7nG3eJfxRC1iUrtRBg4FZBT/0u7UFIfvBk/fDtYGtiWa2eXfbRxvTCJG1U7qzrpux023HE9snO9FoZzWCmPU/JC0CDM4bsCVpIvALwubzL1gQgV5gtiCHneS1s5JsLCDsbV2Rz0ulrG82iCJ1qnAH2AZIWtzMiibArbTttvh0f0XSA2a2qTJakZLuN7OGCVWrRoBJe9sk7UtwGjcSMlIAYGYPFrBxK7CdmXU2rdzYTrcdT8ZGqhM9hJCJYSkzW1MhS/xvC3xBWo/wZeYeM7s8TjHvZWY/ztm+eu1sPHCBmb1Y4DUk24h2ShX2rmE/eV/mQMEdYAtTxt6k/oKkj1rQAa35h1vQcdxrZltIuoEQJPAS8GczW7NJuw66pM+GAxXNy+5MPZ5GCGJ5mq4pUGs2DVtl4+OEKdDbmN+J5h6VpzqejJ0kJxrX8DYDJmZG2b21tpu8dlbm+ptKEPZuYj9p9mIg4VGgrU1pe5MkXQx8sxJVF6cxz7D8G+FT+Tbhg/qMTFnWqed2HMAPY/j8/9C1BaFpZGqRoKEc7AmsYWmK/acShLwXpnjm8gopWeWB+Z0osCYhx+FvCQlu85KS3JfouE9jQWm5NXI0/wrhi803gaPUJRlY5ItNGTYqlCHs3Qgf1eTEHWALY2aV5KhTioyQ6rChZULKzewtSb35LfL3kpa3KJkm6QBCyP90YFweAwpqNocTMhesBJxvTSTYepCpwEiCkkx3WbHoGlcNkhxPJNmJkpbcF0Jk8kmEXITbEJLC5lJWsXQFllJsZCgj4bJTArl+qE6/5wxJj0k6RWEPX3cYFEd9QFiIp3e/IP0W+Hd89qcJ3/YvJuyjOy+njYsJaytTCDJZZzSu3qOMBB6XdIOk8ZWjoI3rJO2Y2I9qx/MnijkeiE60ctFNJ3oc8DrhZ3MYcJ2ZFcmmMNzMbiYs2zxnZuMI63FlcnNv2DCznxFk/66mS9j77BKeXcEl0nLiI8A2wMy2kbQ8IUXLuQoJYK+0YvkAzwDulVTZfrAnYQqutxiciXz7MnCehW0cV8c1kzysV1lTknQ+YTtFX9EwuCgnXyfk45tDECroznTbcQSN16zjKTraSB29AYwzsxOJIx1JgyVdZvkzm8+J2y+eknQk8CJhr2qZlOE4ctkws5tiZOsQKB752SiKlGJT0wMaHwG2CWb2ipn9kjAF+BBwYsH2lwBfJCh0vAp8yZoksS2ZwepKe7QdXaojkP+L2ry0S2ZWNB1UqVjY7zcdGBrP7wcKTVOb2WJmNsjMhpvZ4vF6nvOTlEece5yZ/c7M9jSzPYALJF1WpB+kj94AVpF0PICkYYTRT5FURN8EFgGOAjYlBBgdULAPzShj7aypDUmHSXoFmExYv3+AAuv4MYr0WLo0YYcCf5jXgTaViOsJfATYBkhalzBq2h14g7AH7n9yts2um00hRAj2hfO4nDDSmEFQYbkj9m8twjRoHjaSVNkOIsKI5R36YKNwSYEjzbiU5lnuV5F0vJmdFh3PVYQvSEVIHb1BWOe6LDrBbYC/m9mZeRubWSWTxizC+l8rczSwgZnN6Gb7LxKjSAHM7CVJnhi3G7gDbA8uIEiQ7WRmLxVsezFh5HQHYd1sXeC/S+1dDszsVEk3AysAN1rX/pxBBEmxPDbKjOJMpYzAkWbkmW5LcjyRbjvRqm0tZwHnErI43CZpk2bBW83WTa1cabnemgJ9mq4tNt2hp6NIBwy+D7DFUZBoutTM9u1m+3l7seIU5H2+iTYdRSUYdWXZGELY91XKXq/4jLobnqscz1C6HM/5UHhfpYDLCDMEhZyowmb+ejTdF6mSNTgbrZ3lXYdLtRGjqy+Mrye7vzPXvl1JRxNEFnYgBIv9J/DHkgNpBgTuANsASXcQFEMK7zmr/hBt9KHq5EfST4CZhCS23yAEjjzajbWzRs9o5ACTHE+0UYoTjcEre5rZlXnqV7UtTYNTJSiwlGTjPkKar2qd2IsL2NgB2JHwheAGM7spb1unC3eAbYAS8t+pS/0E5ldAcYHdBOKH/kFkPqQIWTJK+4NTVLxp0oduOZ7YPtmJZmxNMrOx3elHxkaSBqdKUGApycY/rQSllhjtnc004sEvBfE1wPag2/nv+tm6Wcsj6WYLMmOnWRDf7vYGZ0kHmdn5mevBwPcs6rM2cn7xfqek7xCCogoTt9ckOdEM/4hTd1cy/5e0PFOOZeXAK2PtrAwbf5d0KGErSXYKNJcDk3QY8APgA8IIUoTo0zyqOE4GHwE6TolIehQ4mDBNuC8LrlkVWXv7I2FD/UGEaNKLgNusWGqo04EZdMPxZGyUMXp7tkaxWRMpM5WrwZm8dlaSjW69F5n2TwFbJkSROhF3gG1AnKqqJYZdRD/TKQFJexAc1idZcG9XoWnDaO/LwK8IzmtfM7urYPukD9toI9mJdheVnAOvjLWzvl5/k3Q9YZ9uSiSpgzvAtkDSppnLhQn7Aeea2TF91KUBj6Tvm9kpDe6v3yyQQ0EA+mJCsMS6wKPAt3v7g68MJxrtbMCCYtaXJHavYjt3Drwy1s5SbaS8F6lRpE4X7gDbFEn3mdlmfd0PpzZ5om0lPQ4caSEFkQgZM/7TzPIowGTt9JjjKdCHk4CtYz+uI+w5vdOCOk0Z9vO8nzXXzgqOhsuwkfRelBFF6gQ8CKYNUBCurjCIEKa9RB91x8lHng3Tm1lMdByjR8+QVEiDs96HLVDIAZbgRPcANgL+aWZfk7QcGfmuEsjzfqYqsJRlI/W9GGpm3054vhNxB9gePEDX+shcggblQX3WGycPeaZehks6E1jJzD6rkBV9S+DJAs9JdjwlOdH3Y1Tq3Dh9+BqwSpF+NCHP+5mqwFKWjdT3IimK1OnCHWALo5Ax/HkzWz1eZ3PoPdqHXXPK4SLCWk9l8/yThECU8+s1qEEZjqeM0dskSSMJ20IeIGh63lPQRirHA3crZGHo7tpZGTZS34t9Mn2Z1wV8G0Rh3AG2NucC2wPZHHrfAMYQcuiVsr7i9Ah5VHuWMbOrFLMomNncKFxQhDIcT7ITNbP/iqe/jVGMi5vZ5IL9aESeKdBzCVlG5ls7K0iyjdT3ovKF10nHHWBrU0YOPacHSN3EHnlP0tLE6T1JW5A/MwbxOWU4nlJGb5K+RNgeYoQp1EL9UHoOvDLWzkpZf5O0ErAaXfkAP21mtxdo3+eBTe2AR4G2MJKmAmPiyOBx4NDKH5GkqWbW3ezwTiIlbWLfBDibsBF8KrAssEdRB1bteMysOyoqFVuj6IYTlfRrQsqty2PRl4GnzeyInO3L0OD8EWF5oNtrZyXZ+DHh9T8KVEb0ZjkzW/R0RO1Awh1gCyPpBODzhE3KqwKbRJmmtYCLi3w4OOXT3U3smbXdVxSySBxGWNt9FDix4IdtkuPJ2ElyovEL2roxmrWiU/qIma2bs/1DpGtwliEKUIaNJ4ANzWxO08q120+ha012o8qarJnt0B17AxmfAm1hrIQcek7PEDexf5OQ+Xxd4KsKIsh5Igjnre0CnyAEwXR3bXdb5nc8FwOFMinUcKKHSdq+oBOdRviS9ly8XiWW5SVZg7OMtbOS1t+eIWTX6JYDpOcjagcM7gBbHDO7t0ZZkTB5p2f4GwtuYr8fyLOJvcy13VTHAwlONO5bNIJI+2NxE7cBmwP3FejDVZLOBUZKOoSgwVlYaLyMtbMSbMwGHopfXrsTSdofImrbAneAjtMzpGxiHyxpiJnNJQR3HJq5l+tvtkTHA2lO9GcFn1UTM/uZggbnO8A6hKngQhqcZexnLGlP5Ph4dIteiKgdMLgDdJyeIWUT++XAbZJmAO8DdwDEtd28UaDJjqcMJ2pVGdtVpaFZBDO7Ke6/q0RO5srgnqGM/YzJNppJlkm62sx2b1InKYrUCbgDdJye4SK6uYm9jLXdkhxPKaO3+PxDgZPpZg47lZMDr4y1s95Yf2uWIqpmFCngDrAg7gAdp2dI2sRe1tpuiuMpc/QGfIc0Dc0yNDjLWDvrjfW3ZqH5uxG2g3Q3iMaJuAN0nJ4heRN7SaQ6nuTRWyRVQzNZg7OMtbN+sv6WGkXqRNwBOk7P8G1CoMOaku4ibmLvg36UId6c7ERJ19AsQ4OzlLWzXlh/aybrlhpF6kTcATpOiWQ2sT8o6TN0bWK/EXihD7pUhuMow4mmamgma3CWsXZWko3tgLvN7P06VY5tYiIpitTpwpVgHKdEJD0IbG9mbyoIlF9B1yb2dXtbrkolJE9VCRnIowjAxnnrl90+2khSYCnRxsWEiOA3CRG+txPUdXJltM9hv2kUqRPwEaDjlEt/EygvQ7y5jCwKqTnsysiBV8baWbINMzsAIOqZ7kGQy1uR8j6PPS1STtwBOk65JG9iL5kyHEcZTjQ1h10ZOfDKWDtLtiHpK8CngI8RdHzPIe71LAmf1suJO0DHKZcyNrGXSRmOI9mJpmpolqTBWcbaWRk2fkFYV/0tcKuZTU+053QTXwN0nJKJWx4qm9jfi2VrE/LXPdinnesGKRkQJB1jZj+J53ua2Z8y935kZt8t0I8ezYFXxtpZXhuS1gc+TciwMRp4wsy+mvLsjO3k9dKBwqC+7oDjtBtmdq+Z/bXi/GLZk73p/CQdkznfs+rej4rYMrPVaxx5R5B7Z86Pr7r32bx9iBqcZ8djG+AnQK78eQUoY+0sz5eCxQnaqqsBo4AlKLC2Kmk7ScMbVGkWRepE3AE6TnuS7HhKcqKqc17ruhF7ENZUXzGzrxH0OJco0D4PZUyH5bFxJ/AFYDLwZTNbpxIYk5P9gYcl3Svpp5K+IGnJeR0wu7FYlwcu7gAdpz0pw/GUMXqzOue1rhvxvpl1Ai2fA8/MNoyKMv9rZoX3hprZAWa2NvAl4HlCFOnrJXdzQOBBMI7TnpTheMpwohtJeifWHx7PK+0Xrt9sAXpDg7PIiLTbNiRtSRBFHwGsKmkj4LCMzFqz9j0dRTpg8CAYx2lDovD2e0THQ5eSi4CFzWxoDhsPmtkm1ee1rnsTSaPohgZnMwUWSTs2mz4sycZEwpTu+EqwiqSpZrZBztcxA48iLQV3gI7j1KQMJ1pyf+bT4AQoosFZhgJLSTYmmtnm2WhNSQ+b2UYFbPRYFOlAwqdAHcepiZkN7us+VChDg7MMBZaSVFyel/QJwCQNBb4JPJa3cWoUqdOFjwAdx+n3lKTBWb12didwh5nlXkssycYywFnA9oTR9I3AN83sjZztJ8fn3gnc3p1AGifgDtBxnH6PpL8De5rZrAQbyWtn/Wn9TdIiZpaapWNA4w7QcZx+j6SrCXv/knLglbF21l0bkk5scNvM7JScz58XRWpmhaNInS58DdBxnFYgWYOzjLWzRBvv1ShbFDgIWBrI5QAJWqI7Ed8PM3s4pt5yCuIjQMdxWp48GpxlrJ2Vtf4maTFC8MtBwFXAGWb2Ws62yVGkTsBHgI7jtANNNTjNbENIWztLtSFpKeDbwH7AxcAm3UiEmxRF6nThUmiO47QDTaeyJG0p6VHg8Xi9kaRfF3lIig1JPwXuB94FPmZm47qZBf5w4AhgJeBFYEy8dgriU6CO47Q8eZRpUhVYUm1I6iQE8MxlfoctQhDM4nn74ZSDT4E6jtMO5NLxNLPnpfmqdtSrW7YNM0uacSsritTpwqdAHcfp95SUA2++tTNJR1N87awMG93lvRoHhEAazwHYDXwK1HGcfk9JGpxJCixl2SiDlChSpwt3gI7jtAwZDc6jgRXNbEAt49SIIj2rm4E0Dr4G6DhOC5CSA6+MtbP+sP4Wo0i/BJxHiCLttiycE/ARoOM4/Z4UDU5J/1OjeJ4Ci5mN6A0bqXgUafm4A3QcpyUoScczee3M19/aB58CdRyn35Oq41mGAktJKi5OP8IdoOM4rcCdmeOcIhqcZayd+fpbe+JToI7jtAzd0eAsY+3M19/aE3eAjuP0ezwHntMTuBKM4zitwC8IOfDegJADjxAQ4zjdxh2g4zgtgZk9X1VUWMfTcbJ4EIzjOK2A58BzSsfXAB3H6ff0Fw1Op71wB+g4juMMSHwK1HGcfkt/0OB02hcfATqO02/pDxqcTvviDtBxnJbANTidsvEpUMdx+jWuwen0FO4AHcfpt7gGp9OT+BSo4zj9FtfgdHoSd4CO4zjOgMSl0BzHcZwBiTtAx3EcZ0DiDtBxHMcZkLgDdBzHcQYk/w8qs06yZs5DEwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.heatmap(train1.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.736642Z",
     "start_time": "2020-08-05T13:41:05.724263Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Survived           1.000000\n",
       "Sex_female         0.543351\n",
       "New_name_Mrs       0.339040\n",
       "New_name_Miss      0.327093\n",
       "Fare               0.257307\n",
       "Embarked_C         0.168240\n",
       "New_name_Master    0.085221\n",
       "Parch              0.081629\n",
       "New_name_Misc      0.022030\n",
       "Embarked_Q         0.003650\n",
       "SibSp             -0.035322\n",
       "Age               -0.064910\n",
       "Embarked_S        -0.149683\n",
       "Pclass            -0.338481\n",
       "Sex_male          -0.543351\n",
       "New_name_Mr       -0.549199\n",
       "Name: Survived, dtype: float64"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 进行排序好\n",
    "\n",
    "train1.corr()['Survived'].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**从现在的相关性分析可以看出 相关系数小于正负0.1的似乎相关性都不高。到时候我们剔除这些特征再分析一下看下效果**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.745390Z",
     "start_time": "2020-08-05T13:41:05.738477Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.616162\n",
       "1    0.383838\n",
       "Name: Survived, dtype: float64"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train1.Survived.value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "假如我们全猜0的话，准确率也有0.6。这叫做空准确率。所以接下来的模型准确率。一定要高于这个数字"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型预测"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "接下来我们将初步利用上面的数据来训练模型，看下准确率"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.758248Z",
     "start_time": "2020-08-05T13:41:05.746981Z"
    }
   },
   "outputs": [],
   "source": [
    "MODEL = [\n",
    "#     AdaBoostClassifier(), \n",
    "    RandomForestClassifier(), \n",
    "    LogisticRegressionCV(), \n",
    "    BernoulliNB(), \n",
    "    GaussianNB(), \n",
    "    SVC(), \n",
    "    DecisionTreeClassifier()\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.786551Z",
     "start_time": "2020-08-05T13:41:05.760509Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \\\n",
       "0         0       3  22.0      1      0   7.2500           0         1   \n",
       "1         1       1  38.0      1      0  71.2833           1         0   \n",
       "2         1       3  26.0      0      0   7.9250           1         0   \n",
       "3         1       1  35.0      1      0  53.1000           1         0   \n",
       "4         0       3  35.0      0      0   8.0500           0         1   \n",
       "\n",
       "   Embarked_C  Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  \\\n",
       "0           0           0           1                0              0   \n",
       "1           1           0           0                0              0   \n",
       "2           0           0           1                0              0   \n",
       "3           0           0           1                0              0   \n",
       "4           0           0           1                0              0   \n",
       "\n",
       "   New_name_Miss  New_name_Mr  New_name_Mrs  \n",
       "0              0            1             0  \n",
       "1              0            0             1  \n",
       "2              1            0             0  \n",
       "3              0            0             1  \n",
       "4              0            1             0  "
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.796499Z",
     "start_time": "2020-08-05T13:41:05.789225Z"
    }
   },
   "outputs": [],
   "source": [
    "x, y = train1.drop('Survived', axis=1), train1['Survived']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.807953Z",
     "start_time": "2020-08-05T13:41:05.798639Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 查看内置的属性\n",
    "# dir(DecisionTreeClassifier())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:05.819711Z",
     "start_time": "2020-08-05T13:41:05.810420Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sklearn.tree._classes.DecisionTreeClassifier"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "DecisionTreeClassifier().__class__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.414957Z",
     "start_time": "2020-08-05T13:41:05.822800Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>    0.825112\n",
      "<class 'sklearn.ensemble._forest.RandomForestClassifier'>        0.811659\n",
      "<class 'sklearn.naive_bayes.GaussianNB'>                         0.798206\n",
      "<class 'sklearn.naive_bayes.BernoulliNB'>                        0.766816\n",
      "<class 'sklearn.tree._classes.DecisionTreeClassifier'>           0.757848\n",
      "<class 'sklearn.svm._classes.SVC'>                               0.632287\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "def modeling(model_list, x, y):\n",
    "    d = {}\n",
    "    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2020)\n",
    "    for model in model_list:\n",
    "        model.fit(x_train, y_train)\n",
    "        score = model.score(x_test, y_test)\n",
    "#         print(f'{model.__class__} : {score}')\n",
    "        d[model.__class__] = score # 获得属性的名字\n",
    "    print(pd.Series(d).sort_values(ascending=False))\n",
    "\n",
    "modeling(MODEL, x, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**从上面的结果可以看出，逻辑回归和随机森林的效果似乎还不错。这还是在没有调节参数的情况下**\n",
    "\n",
    "下面使用网格搜索进行调参"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 网格搜索"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**TODO**\n",
    "\n",
    "1. 上面不是进行了一些相关性分析吗，看看删除一些特征准确率的变化情况\n",
    "2. 使用网格搜索，查找最佳参数，提升模型的准确度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.418809Z",
     "start_time": "2020-08-05T13:41:07.416539Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# help(LogisticRegressionCV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.434657Z",
     "start_time": "2020-08-05T13:41:07.420053Z"
    }
   },
   "outputs": [],
   "source": [
    "model_param_dict = {\n",
    "# 'LogisticRegressionCV'\n",
    "    0:{\n",
    "    'penalty':['l1','l2','elasticnet'],\n",
    "    'solver':['newton-cg', 'lbfgs','liblinear','sag','saga'],\n",
    "    'class_weight':['balance',None],\n",
    "#     'cv':[2,3,4],\n",
    "},\n",
    "\n",
    "# 'RandomForestClassifier'\n",
    "    1:{\n",
    "    'n_estimators':[90,100,110],\n",
    "    'criterion':['gini','entropy'],\n",
    "    'max_depth':[i for i in range(15)],\n",
    "    'min_samples_split':[i for i in range(5)],\n",
    "    'class_weight':['balanced', 'balanced_subsample'],\n",
    "},\n",
    "\n",
    "# GaussianNB_param = {}\n",
    "\n",
    "# BernoulliNB_param = {   }\n",
    "\n",
    "# 'DecisionTreeClassifier'\n",
    "    2:{\n",
    "    'criterion':['gini','entropy'],\n",
    "    'splitter':['best','random'],\n",
    "    'max_depth':[i for i in range(5,15)],\n",
    "    'min_samples_split':[i for i in range(5)],\n",
    "#     'minz_samples_leaf':[i for i in range(5)],\n",
    "    'max_features':['auto', 'sqrt', 'log2'],\n",
    "    'class_weight':['balanced',None],\n",
    "    \n",
    "},\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.446976Z",
     "start_time": "2020-08-05T13:41:07.439372Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LogisticRegressionCV()\n",
      "{'class_weight': 'balance', 'penalty': 'l1', 'solver': 'liblinear'}\n",
      "0.8251121076233184\n",
      "--------------------\n",
      "RandomForestClassifier()\n",
      "{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 4, 'n_estimators': 90}\n",
      "0.8251121076233184\n",
      "--------------------\n",
      "DecisionTreeClassifier()\n",
      "{'class_weight': None, 'criterion': 'gini', 'max_depth': 7, 'max_features': 'auto', 'min_samples_split': 4, 'splitter': 'random'}\n",
      "0.757847533632287\n",
      "--------------------\n"
     ]
    }
   ],
   "source": [
    "model_list = [\n",
    "    LogisticRegressionCV(),\n",
    "    RandomForestClassifier(), \n",
    "    DecisionTreeClassifier(),\n",
    "]\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2020)\n",
    "for num, model in enumerate(model_list):\n",
    "    gscv = GridSearchCV(model, param_grid=model_param_dict[num], cv=2, n_jobs=-1)\n",
    "    gscv.fit(x_train,y_train)\n",
    "    score = gscv.score(x_test,y_test)\n",
    "    print(model)\n",
    "    print(gscv.best_params_)\n",
    "    print(score)\n",
    "    print('-'*20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 去掉相关性比较差的特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = abs(train1.corr()['Survived']) > 0.02\n",
    "# 注意有些相关系数是小于0的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Survived            True\n",
       "Pclass              True\n",
       "Age                 True\n",
       "SibSp               True\n",
       "Parch               True\n",
       "Fare                True\n",
       "Sex_female          True\n",
       "Sex_male            True\n",
       "Embarked_C          True\n",
       "Embarked_Q         False\n",
       "Embarked_S          True\n",
       "New_name_Master     True\n",
       "New_name_Misc       True\n",
       "New_name_Miss       True\n",
       "New_name_Mr         True\n",
       "New_name_Mrs        True\n",
       "Name: Survived, dtype: bool"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.series.Series"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(temp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp_list = temp.index.to_list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = [i for i in temp_list if temp[i]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Survived',\n",
       " 'Pclass',\n",
       " 'Age',\n",
       " 'SibSp',\n",
       " 'Parch',\n",
       " 'Fare',\n",
       " 'Sex_female',\n",
       " 'Sex_male',\n",
       " 'Embarked_C',\n",
       " 'Embarked_S',\n",
       " 'New_name_Master',\n",
       " 'New_name_Misc',\n",
       " 'New_name_Miss',\n",
       " 'New_name_Mr',\n",
       " 'New_name_Mrs']"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "train4 = train1[res]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>    0.820628\n",
      "<class 'sklearn.ensemble._forest.RandomForestClassifier'>        0.816143\n",
      "<class 'sklearn.naive_bayes.GaussianNB'>                         0.793722\n",
      "<class 'sklearn.naive_bayes.BernoulliNB'>                        0.766816\n",
      "<class 'sklearn.tree._classes.DecisionTreeClassifier'>           0.744395\n",
      "<class 'sklearn.svm._classes.SVC'>                               0.632287\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "res.pop(0)\n",
    "x4, y4 = train4[res], train4['Survived']\n",
    "\n",
    "\n",
    "def modeling(model_list, x, y):\n",
    "    d = {}\n",
    "    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2020)\n",
    "    for model in model_list:\n",
    "        model.fit(x_train, y_train)\n",
    "        score = model.score(x_test, y_test)\n",
    "#         print(f'{model.__class__} : {score}')\n",
    "        d[model.__class__] = score # 获得属性的名字\n",
    "    print(pd.Series(d).sort_values(ascending=False))\n",
    "\n",
    "modeling(MODEL, x4, y4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 总结"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**在前面部分中，对数据的处理有以下方法：**\n",
    "- 单独就是对定序等级的数据进行了one hot形式处理\n",
    "- 年龄和船票价格都是连续的。暂时没有离散化处理\n",
    "- 跑了下模型，进行了下网格搜索"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "网格搜索似乎只有一点点帮助，但效果没有很明显\n",
    "\n",
    "当去除掉相关性比较低的特征以后，随机森林的模型精确度反而上升了，其他的模型准确度略微有点下降，但模型的拟合速度得到了很大的提升"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 提升模型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**接下来考虑创造一些特征，来提升模型的准确度。同时也会考虑模型融合**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 尝试归一化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "年龄和船票价格对其他变量的影响比较大。所以试一试将这两行数据进行归一化。看看模型的准确率有没有提升"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.452596Z",
     "start_time": "2020-08-05T13:41:07.448858Z"
    }
   },
   "outputs": [],
   "source": [
    "train2 = train1.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.464794Z",
     "start_time": "2020-08-05T13:41:07.454089Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train2 is train1\n",
    "\n",
    "# 这里注意和==的区别。is是判断两个变量指向的id是不是一样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.486200Z",
     "start_time": "2020-08-05T13:41:07.466372Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \\\n",
       "0         0       3  22.0      1      0   7.2500           0         1   \n",
       "1         1       1  38.0      1      0  71.2833           1         0   \n",
       "2         1       3  26.0      0      0   7.9250           1         0   \n",
       "3         1       1  35.0      1      0  53.1000           1         0   \n",
       "4         0       3  35.0      0      0   8.0500           0         1   \n",
       "\n",
       "   Embarked_C  Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  \\\n",
       "0           0           0           1                0              0   \n",
       "1           1           0           0                0              0   \n",
       "2           0           0           1                0              0   \n",
       "3           0           0           1                0              0   \n",
       "4           0           0           1                0              0   \n",
       "\n",
       "   New_name_Miss  New_name_Mr  New_name_Mrs  \n",
       "0              0            1             0  \n",
       "1              0            0             1  \n",
       "2              1            0             0  \n",
       "3              0            0             1  \n",
       "4              0            1             0  "
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.500751Z",
     "start_time": "2020-08-05T13:41:07.489149Z"
    }
   },
   "outputs": [],
   "source": [
    "min_max = MinMaxScaler()\n",
    "min_max_data = pd.DataFrame(min_max.fit_transform(train2[['Age', 'Fare']]), columns=['Age_scaler', 'Fare_scaler'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.513267Z",
     "start_time": "2020-08-05T13:41:07.503094Z"
    }
   },
   "outputs": [],
   "source": [
    "train2 = pd.concat([train2, min_max_data], axis=1).drop(['Age','Fare'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.540557Z",
     "start_time": "2020-08-05T13:41:07.515546Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "      <th>Age_scaler</th>\n",
       "      <th>Fare_scaler</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.271174</td>\n",
       "      <td>0.014151</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.472229</td>\n",
       "      <td>0.139136</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.321438</td>\n",
       "      <td>0.015469</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.434531</td>\n",
       "      <td>0.103644</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.434531</td>\n",
       "      <td>0.015713</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass  SibSp  Parch  Sex_female  Sex_male  Embarked_C  \\\n",
       "0         0       3      1      0           0         1           0   \n",
       "1         1       1      1      0           1         0           1   \n",
       "2         1       3      0      0           1         0           0   \n",
       "3         1       1      1      0           1         0           0   \n",
       "4         0       3      0      0           0         1           0   \n",
       "\n",
       "   Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  New_name_Miss  \\\n",
       "0           0           1                0              0              0   \n",
       "1           0           0                0              0              0   \n",
       "2           0           1                0              0              1   \n",
       "3           0           1                0              0              0   \n",
       "4           0           1                0              0              0   \n",
       "\n",
       "   New_name_Mr  New_name_Mrs  Age_scaler  Fare_scaler  \n",
       "0            1             0    0.271174     0.014151  \n",
       "1            0             1    0.472229     0.139136  \n",
       "2            0             0    0.321438     0.015469  \n",
       "3            0             1    0.434531     0.103644  \n",
       "4            1             0    0.434531     0.015713  "
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:07.552409Z",
     "start_time": "2020-08-05T13:41:07.542689Z"
    }
   },
   "outputs": [],
   "source": [
    "x2, y2 = train2.drop('Survived', axis=1), train2['Survived']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:08.457380Z",
     "start_time": "2020-08-05T13:41:07.554550Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'sklearn.svm._classes.SVC'>                               0.834081\n",
      "<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>    0.807175\n",
      "<class 'sklearn.ensemble._forest.RandomForestClassifier'>        0.807175\n",
      "<class 'sklearn.naive_bayes.GaussianNB'>                         0.798206\n",
      "<class 'sklearn.naive_bayes.BernoulliNB'>                        0.766816\n",
      "<class 'sklearn.tree._classes.DecisionTreeClassifier'>           0.753363\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "def modeling(model_list, x, y):\n",
    "    d = {}\n",
    "    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2020)\n",
    "    for model in model_list:\n",
    "        model.fit(x_train, y_train)\n",
    "        score = model.score(x_test, y_test)\n",
    "#         print(f'{model.__class__} : {score}')\n",
    "        d[model.__class__] = score # 获得属性的名字\n",
    "    print(pd.Series(d).sort_values(ascending=False))\n",
    "\n",
    "modeling(MODEL, x2, y2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>    0.825112\n",
    "\n",
    "<class 'sklearn.ensemble._forest.RandomForestClassifier'>        0.816143\n",
    "\n",
    "<class 'sklearn.naive_bayes.GaussianNB'>                         0.798206\n",
    "\n",
    "<class 'sklearn.naive_bayes.BernoulliNB'>                        0.766816\n",
    "\n",
    "<class 'sklearn.tree._classes.DecisionTreeClassifier'>           0.753363\n",
    "\n",
    "<class 'sklearn.svm._classes.SVC'>                               0.632287"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**数据进行归一化后，svm的效果更好点。变化很大。我们单独对svm进行网格搜索。看看模型上限是多少**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:08.467713Z",
     "start_time": "2020-08-05T13:41:08.458943Z"
    }
   },
   "outputs": [],
   "source": [
    "grid_param = {\n",
    "    'C':np.linspace(0,2,20),\n",
    "    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],\n",
    "    'degree':[i for i in range(5)],\n",
    "    'gamma':['scale', 'auto'],\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:08.482662Z",
     "start_time": "2020-08-05T13:41:08.469273Z"
    }
   },
   "outputs": [],
   "source": [
    "x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size = 0.3, random_state=2020)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:41.711240Z",
     "start_time": "2020-08-05T13:41:08.484253Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8395522388059702\n",
      "{'C': 0.8421052631578947, 'degree': 0, 'gamma': 'scale', 'kernel': 'rbf'}\n"
     ]
    }
   ],
   "source": [
    "grid_search = GridSearchCV(SVC(), param_grid=grid_param, n_jobs=4)\n",
    "grid_search.fit(x2_train, y2_train)\n",
    "score = grid_search.score(x2_test, y2_test)\n",
    "print(score)\n",
    "print(grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**模型精度只提升了一点点**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 连续数据离散化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TODO:\n",
    "\n",
    "1. 尝试创建新的特征\n",
    "2. 对年龄和票价进行分类处理。设置成定序等级"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:41.729344Z",
     "start_time": "2020-08-05T13:41:41.712783Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>New_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "      <td>Miss</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch     Fare Embarked New_name  \n",
       "0      0   7.2500        S       Mr  \n",
       "1      0  71.2833        C      Mrs  \n",
       "2      0   7.9250        S     Miss  \n",
       "3      0  53.1000        S      Mrs  \n",
       "4      0   8.0500        S       Mr  "
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:41.755274Z",
     "start_time": "2020-08-05T13:41:41.731832Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \\\n",
       "0         0       3  22.0      1      0   7.2500           0         1   \n",
       "1         1       1  38.0      1      0  71.2833           1         0   \n",
       "2         1       3  26.0      0      0   7.9250           1         0   \n",
       "3         1       1  35.0      1      0  53.1000           1         0   \n",
       "4         0       3  35.0      0      0   8.0500           0         1   \n",
       "\n",
       "   Embarked_C  Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  \\\n",
       "0           0           0           1                0              0   \n",
       "1           1           0           0                0              0   \n",
       "2           0           0           1                0              0   \n",
       "3           0           0           1                0              0   \n",
       "4           0           0           1                0              0   \n",
       "\n",
       "   New_name_Miss  New_name_Mr  New_name_Mrs  \n",
       "0              0            1             0  \n",
       "1              0            0             1  \n",
       "2              1            0             0  \n",
       "3              0            0             1  \n",
       "4              0            1             0  "
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "我们试试将年龄和性别按区间分好。设置成定序数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:41.981769Z",
     "start_time": "2020-08-05T13:41:41.757723Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([ 54.,  46., 177., 346., 118.,  70.,  45.,  24.,   9.,   2.]),\n",
       " array([ 0.42 ,  8.378, 16.336, 24.294, 32.252, 40.21 , 48.168, 56.126,\n",
       "        64.084, 72.042, 80.   ]),\n",
       " <BarContainer object of 10 artists>)"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAARsUlEQVR4nO3df6xl5V3v8fdHQFqpKSDnTsaZiYerow0aO3CPSNPGVLAWqHEw0QZilDQkownNbW+ae++giVojCU20aBMlGQU7Nb1Q7A+ZUPyBUxJTk4IHOqUzTLkdy1RmMjCnP6CtjaTQr3/sZ+x2ODPnxz777D2P71eystd61lp7fc9Z63zOOs9ea51UFZKkvnzXpAuQJK09w12SOmS4S1KHDHdJ6pDhLkkdOnvSBQBcdNFFNTs7O+kyJOmM8uijj36pqmYWmzcV4T47O8v8/Pyky5CkM0qSL55qnt0yktQhw12SOmS4S1KHlgz3JK9I8kiSzyQ5kOTdrf39SZ5Ksq8N21p7krwvyaEkjye5bMxfgyTpJMv5QPUF4Mqq+kaSc4BPJvnrNu9/V9WHT1r+GmBrG34SuKO9SpLWyZJn7jXwjTZ5ThtO97Sx7cAH2nqfAs5PsnH0UiVJy7WsPvckZyXZBxwHHqyqh9usW1vXy+1Jzm1tm4Cnh1Y/0tpOfs8dSeaTzC8sLKz+K5Akvcyywr2qXqqqbcBm4PIkPwbcArwG+AngQuD/rmTDVbWrquaqam5mZtFr8CVJq7Siq2Wq6jngIeDqqjrWul5eAP4cuLwtdhTYMrTa5tYmSVonS36gmmQG+FZVPZfklcCbgPck2VhVx5IEuA7Y31bZA7w9yT0MPkh9vqqOjad8rbfZnR+fyHYP3/aWiWxXOlMt52qZjcDuJGcxONO/t6ruT/KJFvwB9gG/3pZ/ALgWOAR8E3jbmlctSTqtJcO9qh4HLl2k/cpTLF/AzaOXJklaLe9QlaQOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SerQkuGe5BVJHknymSQHkry7tV+c5OEkh5J8KMl3t/Zz2/ShNn92zF+DJOkkyzlzfwG4sqpeC2wDrk5yBfAe4Paq+iHgq8BNbfmbgK+29tvbcpKkdbRkuNfAN9rkOW0o4Ergw619N3BdG9/epmnzr0qStSpYkrS0ZfW5JzkryT7gOPAg8M/Ac1X1YlvkCLCpjW8CngZo858Hvm+R99yRZD7J/MLCwkhfhCTpP1tWuFfVS1W1DdgMXA68ZtQNV9WuqpqrqrmZmZlR306SNGRFV8tU1XPAQ8DrgPOTnN1mbQaOtvGjwBaANv/VwJfXolhJ0vIs52qZmSTnt/FXAm8CDjII+V9si90I3NfG97Rp2vxPVFWtYc2SpCWcvfQibAR2JzmLwS+De6vq/iRPAPck+T3g08Cdbfk7gb9Icgj4CnD9GOqWJJ3GkuFeVY8Dly7S/gUG/e8nt/8b8EtrUp0kaVW8Q1WSOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ0uGe5ItSR5K8kSSA0ne0dp/J8nRJPvacO3QOrckOZTkySRvHucXIEl6ubOXscyLwLuq6rEk3ws8muTBNu/2qvr94YWTXAJcD/wo8P3A3yf54ap6aS0LlySd2pJn7lV1rKoea+NfBw4Cm06zynbgnqp6oaqeAg4Bl69FsZKk5VlRn3uSWeBS4OHW9PYkjye5K8kFrW0T8PTQakdY5JdBkh1J5pPMLywsrLxySdIpLTvck7wK+Ajwzqr6GnAH8IPANuAY8Acr2XBV7aqquaqam5mZWcmqkqQlLCvck5zDINg/WFUfBaiqZ6vqpar6NvCnfKfr5SiwZWj1za1NkrROlnO1TIA7gYNV9d6h9o1Di/0CsL+N7wGuT3JukouBrcAja1eyJGkpy7la5vXArwCfTbKvtf0GcEOSbUABh4FfA6iqA0nuBZ5gcKXNzV4pI0nra8lwr6pPAllk1gOnWedW4NYR6pIkjcA7VCWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6tGS4J9mS5KEkTyQ5kOQdrf3CJA8m+Xx7vaC1J8n7khxK8niSy8b9RUiS/rPlnLm/CLyrqi4BrgBuTnIJsBPYW1Vbgb1tGuAaYGsbdgB3rHnVkqTTWjLcq+pYVT3Wxr8OHAQ2AduB3W2x3cB1bXw78IEa+BRwfpKNa124JOnUVtTnnmQWuBR4GNhQVcfarGeADW18E/D00GpHWtvJ77UjyXyS+YWFhZXWLUk6jWWHe5JXAR8B3llVXxueV1UF1Eo2XFW7qmququZmZmZWsqokaQnLCvck5zAI9g9W1Udb87Mnulva6/HWfhTYMrT65tYmSVony7laJsCdwMGqeu/QrD3AjW38RuC+ofZfbVfNXAE8P9R9I0laB2cvY5nXA78CfDbJvtb2G8BtwL1JbgK+CLy1zXsAuBY4BHwTeNtaFixJWtqS4V5VnwRyitlXLbJ8ATePWJckaQTeoSpJHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nq0HLuUNWUmd358UmXIGnKeeYuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4tGe5J7kpyPMn+obbfSXI0yb42XDs075Ykh5I8meTN4ypcknRqyzlzfz9w9SLtt1fVtjY8AJDkEuB64EfbOn+S5Ky1KlaStDxLhntV/QPwlWW+33bgnqp6oaqeAg4Bl49QnyRpFUbpc397ksdbt80FrW0T8PTQMkda28sk2ZFkPsn8wsLCCGVIkk622nC/A/hBYBtwDPiDlb5BVe2qqrmqmpuZmVllGZKkxawq3Kvq2ap6qaq+Dfwp3+l6OQpsGVp0c2uTJK2jVYV7ko1Dk78AnLiSZg9wfZJzk1wMbAUeGa1ESdJKLfk/VJPcDbwRuCjJEeC3gTcm2QYUcBj4NYCqOpDkXuAJ4EXg5qp6aSyVS5JOaclwr6obFmm+8zTL3wrcOkpRkqTReIeqJHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtSh5Z8KqQ0DWZ3fnxi2z5821smtm1ptTxzl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ0uGe5K7khxPsn+o7cIkDyb5fHu9oLUnyfuSHEryeJLLxlm8JGlxyzlzfz9w9UltO4G9VbUV2NumAa4BtrZhB3DH2pQpSVqJJcO9qv4B+MpJzduB3W18N3DdUPsHauBTwPlJNq5RrZKkZVptn/uGqjrWxp8BNrTxTcDTQ8sdaW0vk2RHkvkk8wsLC6ssQ5K0mJE/UK2qAmoV6+2qqrmqmpuZmRm1DEnSkNWG+7Mnulva6/HWfhTYMrTc5tYmSVpHqw33PcCNbfxG4L6h9l9tV81cATw/1H0jSVonSz7yN8ndwBuBi5IcAX4buA24N8lNwBeBt7bFHwCuBQ4B3wTeNoaaJUlLWDLcq+qGU8y6apFlC7h51KIkSaPxDlVJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdWjJB4dJ/9XN7vz4RLZ7+La3TGS76oNn7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOjXQTU5LDwNeBl4AXq2ouyYXAh4BZ4DDw1qr66mhlntqkbjABbzKRNL3W4sz9p6tqW1XNtemdwN6q2grsbdOSpHU0jm6Z7cDuNr4buG4M25Akncao4V7A3yV5NMmO1rahqo618WeADYutmGRHkvkk8wsLCyOWIUkaNuqDw95QVUeT/DfgwSSfG55ZVZWkFluxqnYBuwDm5uYWXUaStDojhXtVHW2vx5N8DLgceDbJxqo6lmQjcHwN6pxKk/wwV5JOZ9XhnuQ84Luq6utt/GeB3wX2ADcCt7XX+9aiUOm/Gh81rFGMcua+AfhYkhPv8/+q6m+S/BNwb5KbgC8Cbx29TEnSSqw63KvqC8BrF2n/MnDVKEVJkkbjHaqS1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktShUZ8KKakz/nezPnjmLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQNzFJmhr+U/C145m7JHXIcJekDo0t3JNcneTJJIeS7BzXdiRJLzeWcE9yFvDHwDXAJcANSS4Zx7YkSS83rg9ULwcOVdUXAJLcA2wHnhjT9iRp1Xp8Eua4wn0T8PTQ9BHgJ4cXSLID2NEmv5HkyVVs5yLgS6uqcLysa+WmtTbrWplprQumtLa8Z6S6fuBUMyZ2KWRV7QJ2jfIeSearam6NSloz1rVy01qbda3MtNYF01vbuOoa1weqR4EtQ9ObW5skaR2MK9z/Cdia5OIk3w1cD+wZ07YkSScZS7dMVb2Y5O3A3wJnAXdV1YExbGqkbp0xsq6Vm9barGtlprUumN7axlJXqmoc7ytJmiDvUJWkDhnuktShMzLcp+nRBknuSnI8yf6htguTPJjk8+31ggnUtSXJQ0meSHIgyTumobYkr0jySJLPtLre3dovTvJw26cfah/Er7skZyX5dJL7p6yuw0k+m2RfkvnWNg3H2flJPpzkc0kOJnndpOtK8iPt+3Ri+FqSd066rlbb/2rH/f4kd7efh7EcY2dcuE/how3eD1x9UttOYG9VbQX2tun19iLwrqq6BLgCuLl9nyZd2wvAlVX1WmAbcHWSK4D3ALdX1Q8BXwVuWue6TngHcHBoelrqAvjpqto2dE30pPclwB8Bf1NVrwFey+B7N9G6qurJ9n3aBvwP4JvAxyZdV5JNwP8E5qrqxxhcbHI94zrGquqMGoDXAX87NH0LcMuEa5oF9g9NPwlsbOMbgSen4Pt2H/CmaaoN+B7gMQZ3L38JOHuxfbyO9Wxm8EN/JXA/kGmoq237MHDRSW0T3ZfAq4GnaBdmTEtdJ9Xys8A/TkNdfOfO/QsZXKl4P/DmcR1jZ9yZO4s/2mDThGo5lQ1VdayNPwNsmGQxSWaBS4GHmYLaWtfHPuA48CDwz8BzVfViW2RS+/QPgf8DfLtNf9+U1AVQwN8lebQ9ugMmvy8vBhaAP29dWX+W5LwpqGvY9cDdbXyidVXVUeD3gX8BjgHPA48ypmPsTAz3M0oNfh1P7HrTJK8CPgK8s6q+NjxvUrVV1Us1+JN5M4OHzL1mvWs4WZKfA45X1aOTruUU3lBVlzHojrw5yU8Nz5zQvjwbuAy4o6ouBf6Vk7o6Jnn8t77rnwf+8uR5k6ir9fFvZ/BL8fuB83h5l+6aORPD/Ux4tMGzSTYCtNfjkygiyTkMgv2DVfXRaaoNoKqeAx5i8Kfo+UlO3FQ3iX36euDnkxwG7mHQNfNHU1AX8B9nfVTVcQb9x5cz+X15BDhSVQ+36Q8zCPtJ13XCNcBjVfVsm550XT8DPFVVC1X1LeCjDI67sRxjZ2K4nwmPNtgD3NjGb2TQ372ukgS4EzhYVe+dltqSzCQ5v42/ksHnAAcZhPwvTqquqrqlqjZX1SyDY+oTVfXLk64LIMl5Sb73xDiDfuT9THhfVtUzwNNJfqQ1XcXgsd4TP/6bG/hOlwxMvq5/Aa5I8j3t5/PE92s8x9ikPugY8YOJa4H/z6Cv9jcnXMvdDPrPvsXgTOYmBn21e4HPA38PXDiBut7A4M/Ox4F9bbh20rUBPw58utW1H/it1v7fgUeAQwz+jD53gvv0jcD901JXq+EzbThw4pif9L5sNWwD5tv+/Cvggimp6zzgy8Crh9qmoa53A59rx/5fAOeO6xjz8QOS1KEzsVtGkrQEw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR16N8BQZBZmXkrZTkAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(train1.Age)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.188461Z",
     "start_time": "2020-08-05T13:41:41.983691Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([732., 106.,  31.,   2.,  11.,   6.,   0.,   0.,   0.,   3.]),\n",
       " array([  0.     ,  51.23292, 102.46584, 153.69876, 204.93168, 256.1646 ,\n",
       "        307.39752, 358.63044, 409.86336, 461.09628, 512.3292 ]),\n",
       " <BarContainer object of 10 artists>)"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAARKElEQVR4nO3df6zddX3H8edrVNChsfy4Nk3brBgbDX+Mym5YjWZRiAbQWP5QIjGjIU26P9iC0UTLlmwx2R/4jyjJQtaIsyxORJTQIFG7gln2B+hFkF+VcSWQtin0yqBOiW7oe3+cT/VQ295ze38c7qfPR3JyPt/353Pu9/Ohh1e//dzvuTdVhSSpL3807glIkhae4S5JHTLcJalDhrskdchwl6QOrRj3BADOPffcWr9+/binIUnLyoMPPvizqpo4Vt9rItzXr1/P1NTUuKchSctKkmeP1+e2jCR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdeg18QnV+Vi//dtjO/czN3xwbOeWpBPxyl2SOmS4S1KHDHdJ6pDhLkkdMtwlqUOzhnuStyd5eOjx8ySfSHJ2kt1JnmrPZ7XxSXJTkukkjyS5cPGXIUkaNmu4V9WTVbWxqjYCfwa8DNwJbAf2VNUGYE87BrgM2NAe24CbF2HekqQTmOu2zCXAT6vqWWAzsLPVdwJXtPZm4NYauB9YmWT1QkxWkjSauYb7x4CvtfaqqjrY2s8Bq1p7DbBv6DX7W+1VkmxLMpVkamZmZo7TkCSdyMjhnuR04MPAN47uq6oCai4nrqodVTVZVZMTE8f8/a6SpJM0lyv3y4AfVdXz7fj5I9st7flQqx8A1g29bm2rSZKWyFzC/Sp+vyUDsAvY0tpbgLuG6le3u2Y2AYeHtm8kSUtgpB8cluRM4P3AXw2VbwBuT7IVeBa4stXvAS4HphncWXPNgs1WkjSSkcK9qn4JnHNU7QUGd88cPbaAaxdkdpKkk+InVCWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOjRTuSVYmuSPJT5LsTfKuJGcn2Z3kqfZ8VhubJDclmU7ySJILF3cJkqSjjXrl/kXgO1X1DuACYC+wHdhTVRuAPe0Y4DJgQ3tsA25e0BlLkmY1a7gneTPwF8AtAFX1v1X1ErAZ2NmG7QSuaO3NwK01cD+wMsnqBZ63JOkERrlyPw+YAf4lyUNJvpTkTGBVVR1sY54DVrX2GmDf0Ov3t9qrJNmWZCrJ1MzMzMmvQJL0B0YJ9xXAhcDNVfVO4Jf8fgsGgKoqoOZy4qraUVWTVTU5MTExl5dKkmYxSrjvB/ZX1QPt+A4GYf/8ke2W9nyo9R8A1g29fm2rSZKWyKzhXlXPAfuSvL2VLgGeAHYBW1ptC3BXa+8Crm53zWwCDg9t30iSlsCKEcf9DfDVJKcDTwPXMPiL4fYkW4FngSvb2HuAy4Fp4OU2VpK0hEYK96p6GJg8RtclxxhbwLXzm5YkaT78hKokdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUoZHCPckzSR5N8nCSqVY7O8nuJE+157NaPUluSjKd5JEkFy7mAiRJf2guV+7vq6qNVXXkF2VvB/ZU1QZgTzsGuAzY0B7bgJsXarKSpNHMZ1tmM7CztXcCVwzVb62B+4GVSVbP4zySpDkaNdwL+F6SB5Nsa7VVVXWwtZ8DVrX2GmDf0Gv3t9qrJNmWZCrJ1MzMzElMXZJ0PCtGHPeeqjqQ5C3A7iQ/Ge6sqkpSczlxVe0AdgBMTk7O6bWSpBMb6cq9qg6050PAncBFwPNHtlva86E2/ACwbujla1tNkrREZg33JGcmedORNvAB4DFgF7ClDdsC3NXau4Cr210zm4DDQ9s3kqQlMMq2zCrgziRHxv9bVX0nyQ+B25NsBZ4Frmzj7wEuB6aBl4FrFnzWkqQTmjXcq+pp4IJj1F8ALjlGvYBrF2R2kqST4idUJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA6NHO5JTkvyUJK72/F5SR5IMp3k60lOb/Uz2vF061+/SHOXJB3HXK7crwP2Dh1/Drixqt4GvAhsbfWtwIutfmMbJ0laQiOFe5K1wAeBL7XjABcDd7QhO4ErWntzO6b1X9LGS5KWyKhX7l8APg38th2fA7xUVa+04/3AmtZeA+wDaP2H23hJ0hKZNdyTfAg4VFUPLuSJk2xLMpVkamZmZiG/tCSd8ka5cn838OEkzwC3MdiO+SKwMsmKNmYtcKC1DwDrAFr/m4EXjv6iVbWjqiaranJiYmJei5Akvdqs4V5V11fV2qpaD3wMuLeqPg7cB3ykDdsC3NXau9oxrf/eqqoFnbUk6YTmc5/7Z4BPJplmsKd+S6vfApzT6p8Ets9vipKkuVox+5Dfq6rvA99v7aeBi44x5lfARxdgbpKkk+QnVCWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1KFZwz3J65P8IMmPkzye5LOtfl6SB5JMJ/l6ktNb/Yx2PN361y/yGiRJRxnlyv3XwMVVdQGwEbg0ySbgc8CNVfU24EVgaxu/FXix1W9s4yRJS2jWcK+BX7TD17VHARcDd7T6TuCK1t7cjmn9lyTJQk1YkjS7kfbck5yW5GHgELAb+CnwUlW90obsB9a09hpgH0DrPwycc4yvuS3JVJKpmZmZeS1CkvRqI4V7Vf2mqjYCa4GLgHfM98RVtaOqJqtqcmJiYr5fTpI0ZE53y1TVS8B9wLuAlUlWtK61wIHWPgCsA2j9bwZeWIjJSpJGM8rdMhNJVrb2G4D3A3sZhPxH2rAtwF2tvasd0/rvrapawDlLkmaxYvYhrAZ2JjmNwV8Gt1fV3UmeAG5L8o/AQ8AtbfwtwL8mmQb+G/jYIsxbknQCs4Z7VT0CvPMY9acZ7L8fXf8V8NEFmZ0k6aT4CVVJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ7OGe5J1Se5L8kSSx5Nc1+pnJ9md5Kn2fFarJ8lNSaaTPJLkwsVehCTp1Ua5cn8F+FRVnQ9sAq5Ncj6wHdhTVRuAPe0Y4DJgQ3tsA25e8FlLkk5o1nCvqoNV9aPW/h9gL7AG2AzsbMN2Ale09mbg1hq4H1iZZPVCT1ySdHxz2nNPsh54J/AAsKqqDrau54BVrb0G2Df0sv2tdvTX2pZkKsnUzMzMXOctSTqBkcM9yRuBbwKfqKqfD/dVVQE1lxNX1Y6qmqyqyYmJibm8VJI0i5HCPcnrGAT7V6vqW638/JHtlvZ8qNUPAOuGXr621SRJS2SUu2UC3ALsrarPD3XtAra09hbgrqH61e2umU3A4aHtG0nSElgxwph3A38JPJrk4Vb7W+AG4PYkW4FngStb3z3A5cA08DJwzUJOWJI0u1nDvar+E8hxui85xvgCrp3nvCRJ8+AnVCWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOzRruSb6c5FCSx4ZqZyfZneSp9nxWqyfJTUmmkzyS5MLFnLwk6dhGuXL/CnDpUbXtwJ6q2gDsaccAlwEb2mMbcPPCTFOSNBcrZhtQVf+RZP1R5c3Ae1t7J/B94DOtfmtVFXB/kpVJVlfVwQWb8WvI+u3fHst5n7nhg2M5r6Tl42T33FcNBfZzwKrWXgPsGxq3v9UkSUto3t9QbVfpNdfXJdmWZCrJ1MzMzHynIUkacrLh/nyS1QDt+VCrHwDWDY1b22p/oKp2VNVkVU1OTEyc5DQkScdysuG+C9jS2luAu4bqV7e7ZjYBh3vdb5ek17JZv6Ga5GsMvnl6bpL9wD8ANwC3J9kKPAtc2YbfA1wOTAMvA9cswpwlSbMY5W6Zq47TdckxxhZw7XwnJUmaHz+hKkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6NOuP/NVrz7h+MTf4y7ml5cIrd0nqkOEuSR0y3CWpQ4a7JHXIcJekDi3K3TJJLgW+CJwGfKmqbliM8+jUMa47hLw7SMvVgod7ktOAfwLeD+wHfphkV1U9sdDnkhabt51quVqMK/eLgOmqehogyW3AZsBw78A4w05aLD3+Jb4Y4b4G2Dd0vB/486MHJdkGbGuHv0jy5Eme71zgZyf52uXoVFrvqbRWOGq9+dwYZ7L4TqU/2xOudZ5/zn9yvI6xfUK1qnYAO+b7dZJMVdXkAkxpWTiV1nsqrRVOrfW61sW3GHfLHADWDR2vbTVJ0hJZjHD/IbAhyXlJTgc+BuxahPNIko5jwbdlquqVJH8NfJfBrZBfrqrHF/o8Q+a9tbPMnErrPZXWCqfWel3rIktVjeO8kqRF5CdUJalDhrskdWhZh3uSS5M8mWQ6yfZxz2e+knw5yaEkjw3Vzk6yO8lT7fmsVk+Sm9raH0ly4fhmPndJ1iW5L8kTSR5Pcl2r97re1yf5QZIft/V+ttXPS/JAW9fX200IJDmjHU+3/vVjXcBJSHJakoeS3N2Oe17rM0keTfJwkqlWG+t7edmG+9CPObgMOB+4Ksn5453VvH0FuPSo2nZgT1VtAPa0Yxise0N7bANuXqI5LpRXgE9V1fnAJuDa9ufX63p/DVxcVRcAG4FLk2wCPgfcWFVvA14EtrbxW4EXW/3GNm65uQ7YO3Tc81oB3ldVG4fuaR/ve7mqluUDeBfw3aHj64Hrxz2vBVjXeuCxoeMngdWtvRp4srX/GbjqWOOW4wO4i8HPI+p+vcAfAz9i8MntnwErWv1372kGd5u9q7VXtHEZ99znsMa1DALtYuBuIL2utc37GeDco2pjfS8v2yt3jv1jDtaMaS6LaVVVHWzt54BVrd3N+ts/w98JPEDH623bFA8Dh4DdwE+Bl6rqlTZkeE2/W2/rPwycs6QTnp8vAJ8GftuOz6HftQIU8L0kD7YfrQJjfi/7C7KXkaqqJF3du5rkjcA3gU9U1c+T/K6vt/VW1W+AjUlWAncC7xjvjBZHkg8Bh6rqwSTvHfN0lsp7qupAkrcAu5P8ZLhzHO/l5Xzlfqr8mIPnk6wGaM+HWn3Zrz/J6xgE+1er6lut3O16j6iql4D7GGxNrExy5CJreE2/W2/rfzPwwtLO9KS9G/hwkmeA2xhszXyRPtcKQFUdaM+HGPzFfRFjfi8v53A/VX7MwS5gS2tvYbA3faR+dfvO+ybg8NA/AV/zMrhEvwXYW1WfH+rqdb0T7YqdJG9g8P2FvQxC/iNt2NHrPfLf4SPAvdU2aF/rqur6qlpbVesZ/H95b1V9nA7XCpDkzCRvOtIGPgA8xrjfy+P+RsQ8v4lxOfBfDPYu/27c81mA9XwNOAj8H4N9uK0M9h73AE8B/w6c3caGwd1CPwUeBSbHPf85rvU9DPYpHwEebo/LO17vnwIPtfU+Bvx9q78V+AEwDXwDOKPVX9+Op1v/W8e9hpNc93uBu3tea1vXj9vj8SNZNO73sj9+QJI6tJy3ZSRJx2G4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA79P/hRDmLVWiVVAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(train1.Fare)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fare有很严重的不均衡情况。即某类的数据很多。其他几类数据很少"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "****\n",
    "**cut and qcut**\n",
    "\n",
    "cut:根据等距离原则，将每个数据进行分类\n",
    "\n",
    "qcut:根据数据出现的频率进行区间划分，尽量保证每个区间的数据量相同"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 在下面数据处理中，我们对年龄进行cut划分。我们认为10-20为少年，20-30为青年这样\n",
    "- 对于不是上面类型的数据。我们使用qcut。使样本更均衡"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.205607Z",
     "start_time": "2020-08-05T13:41:42.194032Z"
    }
   },
   "outputs": [],
   "source": [
    "train1['FareBin'] = pd.qcut(train1.Fare,5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.215801Z",
     "start_time": "2020-08-05T13:41:42.209318Z"
    }
   },
   "outputs": [],
   "source": [
    "train1['AgeBin'] = pd.cut(train1.Age, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.513372Z",
     "start_time": "2020-08-05T13:41:42.217076Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:>"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAFCCAYAAAAKd53gAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAg/klEQVR4nO3deZhlVX3u8e8LzSCDjC1gQ9sEMIFcFaRFjBoFREASITfAoygCQtobNWgwie2QiElU8JqoRCDpCNhGBHFguMJlRkyuF+huBGSmJUzN1KgMiggNb/7Yq6hDpaamu846vff7eZ56ap+1d3X96vSpt/ZZe+21ZJuIiGiX1WoXEBERK1/CPSKihRLuEREtlHCPiGihhHtERAtNq10AwKabbupZs2bVLiMiYpWyaNGih21PH23fQIT7rFmzWLhwYe0yIiJWKZLuGmtfumUiIloo4R4R0UIJ94iIFkq4R0S0UMI9IqKFEu4RES2UcI+IaKGEe0RECyXcIyJaaCDuUF0ZZs09r3YJ3HnsvrVLiIgAcuYeEdFKCfeIiBZKuEdEtFDCPSKihRLuEREtNKlwl3SnpJ9IulbSwtK2saSLJd1ePm9U2iXpeEmLJV0v6dVT+QNERMR/tzxn7rvZ3tH27PJ4LnCp7e2AS8tjgH2A7crHHOCklVVsRERMzop0y+wHzC/b84H9e9q/7saVwIaStliB7xMREctpsuFu4CJJiyTNKW2b2b6/bD8AbFa2ZwD39HztvaXteSTNkbRQ0sKlS5e+gNIjImIsk71D9Q22l0h6CXCxpFt6d9q2JC/PN7Y9D5gHMHv27OX62oiIGN+kztxtLymfHwLOAnYBHhzqbimfHyqHLwG26vnyLUtbRET0yYThLmldSesPbQNvBW4AzgUOLYcdCpxTts8F3lNGzewKPNrTfRMREX0wmW6ZzYCzJA0d/03bF0haAJwp6QjgLuCgcvz5wNuAxcATwOErveqIiBjXhOFu+w7gVaO0/wzYY5R2Ax9YKdVFRMQLkjtUIyJaKOEeEdFCCfeIiBZKuEdEtFDCPSKihRLuEREtlHCPiGihhHtERAsl3CMiWijhHhHRQgn3iIgWSrhHRLRQwj0iooUS7hERLZRwj4hooYR7REQLJdwjIloo4R4R0UIJ94iIFkq4R0S0UMI9IqKFEu4RES2UcI+IaKGEe0RECyXcIyJaKOEeEdFCCfeIiBZKuEdEtFDCPSKihRLuEREtNOlwl7S6pB9L+n55vLWkqyQtlvQtSWuW9rXK48Vl/6wpqj0iIsawPGfuHwJu7nl8HPBF29sCvwCOKO1HAL8o7V8sx0VERB9NKtwlbQnsC3y1PBawO/Cdcsh8YP+yvV95TNm/Rzk+IiL6ZLJn7l8C/gp4tjzeBHjE9rLy+F5gRtmeAdwDUPY/Wo6PiIg+mTDcJf0B8JDtRSvzG0uaI2mhpIVLly5dmf90RETnTebM/fXA2yXdCZxB0x3zZWBDSdPKMVsCS8r2EmArgLJ/A+BnI/9R2/Nsz7Y9e/r06Sv0Q0RExPNNGO62P2Z7S9uzgHcAl9l+F3A5cEA57FDgnLJ9bnlM2X+Zba/UqiMiYlwrMs79o8DRkhbT9KmfXNpPBjYp7UcDc1esxIiIWF7TJj5kmO0fAD8o23cAu4xyzJPAgSuhtoiIeIFyh2pERAsl3CMiWijhHhHRQgn3iIgWSrhHRLRQwj0iooUS7hERLZRwj4hooYR7REQLJdwjIloo4R4R0UIJ94iIFkq4R0S0UMI9IqKFEu4RES2UcI+IaKGEe0RECyXcIyJaKOEeEdFCCfeIiBZKuEdEtFDCPSKihRLuEREtlHCPiGihhHtERAsl3CMiWijhHhHRQgn3iIgWSrhHRLRQwj0iooUS7hERLTRhuEtaW9LVkq6TdKOkT5f2rSVdJWmxpG9JWrO0r1UeLy77Z03xzxARESNM5sz9N8Dutl8F7AjsLWlX4Djgi7a3BX4BHFGOPwL4RWn/YjkuIiL6aMJwd+OX5eEa5cPA7sB3Svt8YP+yvV95TNm/hyStrIIjImJik+pzl7S6pGuBh4CLgZ8Cj9heVg65F5hRtmcA9wCU/Y8Cm4zyb86RtFDSwqVLl67QDxEREc83qXC3/YztHYEtgV2A31nRb2x7nu3ZtmdPnz59Rf+5iIjosVyjZWw/AlwOvA7YUNK0smtLYEnZXgJsBVD2bwD8bGUUGxERkzOZ0TLTJW1Ytl8E7AncTBPyB5TDDgXOKdvnlseU/ZfZ9kqsOSIiJjBt4kPYApgvaXWaPwZn2v6+pJuAMyT9PfBj4ORy/MnAv0laDPwceMcU1B0REeOYMNxtXw/sNEr7HTT97yPbnwQOXCnVRUTEC5I7VCMiWijhHhHRQgn3iIgWSrhHRLRQwj0iooUS7hERLZRwj4hooYR7REQLJdwjIloo4R4R0UIJ94iIFkq4R0S0UMI9IqKFEu4RES2UcI+IaKGEe0RECyXcIyJaKOEeEdFCCfeIiBZKuEdEtFDCPSKihRLuEREtlHCPiGihhHtERAsl3CMiWijhHhHRQgn3iIgWSrhHRLRQwj0iooUS7hERLTRhuEvaStLlkm6SdKOkD5X2jSVdLOn28nmj0i5Jx0taLOl6Sa+e6h8iIiKebzJn7suAj9jeAdgV+ICkHYC5wKW2twMuLY8B9gG2Kx9zgJNWetURETGuCcPd9v22rynbjwM3AzOA/YD55bD5wP5lez/g625cCWwoaYuVXXhERIxtufrcJc0CdgKuAjazfX/Z9QCwWdmeAdzT82X3lraIiOiTSYe7pPWA7wIftv1Y7z7bBrw831jSHEkLJS1cunTp8nxpRERMYFLhLmkNmmA/zfb3SvODQ90t5fNDpX0JsFXPl29Z2p7H9jzbs23Pnj59+gutPyIiRjGZ0TICTgZutv2PPbvOBQ4t24cC5/S0v6eMmtkVeLSn+yYiIvpg2iSOeT1wCPATSdeWto8DxwJnSjoCuAs4qOw7H3gbsBh4Ajh8ZRYcERETmzDcbf8HoDF27zHK8QY+sIJ1xQqYNfe82iVw57H71i4hotNyh2pERAsl3CMiWijhHhHRQgn3iIgWSrhHRLRQwj0iooUS7hERLZRwj4hooYR7REQLJdwjIloo4R4R0UIJ94iIFkq4R0S0UMI9IqKFEu4RES2UcI+IaKGEe0RECyXcIyJaKOEeEdFCCfeIiBZKuEdEtFDCPSKihRLuEREtlHCPiGihhHtERAsl3CMiWijhHhHRQgn3iIgWSrhHRLRQwj0iooUS7hERLTRhuEs6RdJDkm7oadtY0sWSbi+fNyrtknS8pMWSrpf06qksPiIiRjeZM/evAXuPaJsLXGp7O+DS8hhgH2C78jEHOGnllBkREctjwnC3/UPg5yOa9wPml+35wP497V9340pgQ0lbrKRaIyJikqa9wK/bzPb9ZfsBYLOyPQO4p+e4e0vb/YwgaQ7N2T0zZ858gWVEjG/W3PNql8Cdx+5bu4TooBW+oGrbgF/A182zPdv27OnTp69oGRER0eOFhvuDQ90t5fNDpX0JsFXPcVuWtoiI6KMXGu7nAoeW7UOBc3ra31NGzewKPNrTfRMREX0yYZ+7pNOBNwObSroX+BRwLHCmpCOAu4CDyuHnA28DFgNPAIdPQc0RETGBCcPd9jvH2LXHKMca+MCKFhURESsmd6hGRLRQwj0iooUS7hERLZRwj4hooYR7REQLJdwjIloo4R4R0UIvdOKwiFjFZBK1bsmZe0RECyXcIyJaKOEeEdFCCfeIiBZKuEdEtFDCPSKihRLuEREtlHCPiGihhHtERAsl3CMiWijhHhHRQgn3iIgWSrhHRLRQwj0iooUS7hERLZRwj4hooYR7REQLJdwjIloo4R4R0UIJ94iIFkq4R0S0UMI9IqKFptUuICKi32bNPa92Cdx57L5T+u9PyZm7pL0l3SppsaS5U/E9IiJibCs93CWtDpwA7APsALxT0g4r+/tERMTYpuLMfRdgse07bD8FnAHsNwXfJyIixiDbK/cflA4A9rZ9ZHl8CPBa2x8ccdwcYE55+NvArSu1kBdmU+Dh2kUMiDwXjTwPw/JcDBuU5+JltqePtqPaBVXb84B5tb7/aCQttD27dh2DIM9FI8/DsDwXw1aF52IqumWWAFv1PN6ytEVERJ9MRbgvALaTtLWkNYF3AOdOwfeJiIgxrPRuGdvLJH0QuBBYHTjF9o0r+/tMkYHqJqosz0Ujz8OwPBfDBv65WOkXVCMior5MPxAR0UIJ94iIFkq4R0S0UCcnDpO08SQOe9b2I1NdS22SHpvoEOB+2y/vRz015XUxTNLxkzjsMdufnPJiKpN0/SQOW2p7jykvZjl0MtyB+8qHxjlmdWBmf8qp6qe2dxrvAEk/7lcxleV1MWw/4G8mOGYu0Ppwp/k/f9s4+8UADvfuarjfnEB7zh+vpGPaIK+LYV+0PX+8AyRt1K9iKnuf7bvGO0DS+/tVzGR1ciikpLVtP7mix7SFpGm2l5Xt9YDfAe6w/fO6lfVXXhcxkaGuu1Xhd6OTF1Qn88vZlV9gSYcBD0q6TdI+wPXAccB1kt5Ztbg+G/l/LmlbSX/cO2V1V14Xo5F0W+0aapA0U9IZkpYCVwFXS3qotM2qXN6YOhnukl4p6UpJ90ia1/v2UtLVNWur4CM0s3LuBXwL2LNcGJoNfKxmYf0m6XJJm5btQ4DzadYl+JakP6taXJ9JelzSY+XjcUmPA9sMtdeur8++BZwFbG57O9vbAlsAZ9NMaT6QOhnuwInAMcArgNuA/5C0Tdm3Rq2iKnnG9sO2/xP4pe2fAth+sHJdNUy3PTSN61HA68rU1a8F/qReWVWcShNe29le3/b6wN1l+8V1S+u7TW1/y/YzQw22n7F9BrBJxbrG1dULquvbvqBsf0HSIuCCcrbWtYsQd0v6HLA+cIukfwC+B7wFuL9qZf33tKQZtpcAvwR+Vdp/QzNiojNsHyVpZ+B0SWcDX6F7vxtDFkk6EZgP3FPatgIOBQb2AntXL6heB/y+7Ud72l4JfBfY2PbA/jVe2SS9GPgAzS/uV4C9gcOAu4G/s92ZgJf0ZpolIr8LbAy8mmYCvDcAF9r+QrXiKpG0GvBB4EBgG9svrVxS35XZbY+gGR46ozTfC/wf4GTbv6lV23i6Gu4H04wGuXJE+0zgr2137S14FJI2AA4GXk7zzvZe4Bzbt1QtrDJJWwA72T6/di0xOZ0M9xhWpmc+w/bDkrYFTmH4WsSRtn9StcCoomcthvtsX1JOiH4PuBmYZ/vpqgUOCEl/YPv7tesYTScvqJYumKHtNSR9UtK5kj4raZ2atVXwpz0XEb9Mc/PKRsBHgX+uV1b/Sdpc0kmSTpC0iaRjJF0v6cxy5tolpwL7Ah+S9G803TJXAa8BvlqzsAHzmtoFjKWTZ+6SrrH96rL9DzRXvE8F9gc2sf2eiuX1laRbbf922V5g+zU9+663/cqxv7pdJF0AnAesS9M1cxrwTZrXxVts71evuv4a+r+XNI1mmcyX2n5GkoDruvS6WFV1Ndx/PHSbuaRrgdfYfrqLL1xJn6G5SPS3NG/Dn6AZ07s78Me2/6BieX014nVxt+2ZPfuutb1jteL6TNINNBeU16W5uP4y2z+XtDbwY9vbVy2wzyT9Ds+/oLoEONf2zfWqGl9Xh0JuIOmPaLql1hrqP7RtSZ36a2f7E+Uu1dOBbYC1gDk0Y5zfVa+yKnq7Kb8+zr4uOBm4hWYI6CeAb0u6A9iVAb5xZypI+ijwTpqfe+gmxy1phomeYfvYasWNo6tn7qeOaJpr+0FJmwOnDdrUndEfkv4W+LztX45o3xY41vYBdSqrQ9JLAWzfJ2lDmnsf7rbdqbu4y7QLvzvyInK56Hyj7e3qVDa+ToZ7TI6kPW1fXLuOqKN0U+7C87sirnbHQkPSLcBeI2eGlPQy4KKha1aDpqvdMkM370wfut2+p/2VticzOX8XnEw35i4HnguzA2lu6PoOzXWH/Wi6J/7Z9rMVy+srSW+lmabjdppQh6YrYltJ77d9UbXi+u/DwKWSbmf4DtWZwLY0N3gNpE6euUs6CPgS8BDNXDKH2V5Q9j03kqYLJI21yICA3W2v2896aiq3mL8EWBN4jOb6w7k0QwIftP2hiuX1laSbgX1s3zmifWvg/A5eUF2N//4uZkHvfDODpqtn7h8HdrZ9v6RdgH+T9DHbZzH+Kjxt9Ebg3TRzqfQaekveJW+0/QpJawAPAFvYfkrS6cA1lWvrt6G7c0daQvcm14PmXcsttq8s0/zOBh4Hbqxa1Ti6Gu6rD82ZYvtqSbsB35e0Fd2bHOlK4AnbV4zcIenWCvXUtAygDItdYPup8niZpM50yRSnAAskncHzJ8t6B013XWdImgu8D/iNpC8AfwH8P+DTkk62/Y9VCxxDV7tlfgQc0tvfLml9muF/b7C9Vq3aoh5J/xc4cJTRMpvTjGnu1DsZSdsz+tjum+pV1X+SbqQ5U18HuBP4LdtLJa0LXGX7f9SsbyxdDfdXAb+yvXhE+xrAQbZPq1NZDKLyS7yu7Ydq1xL913O37uo002BvPnRxXdINCfdY5UiaZ3tO7TpisEg6xvYxtevoF0lfo7nIvi7NHdzLgAtoRlOtb/ugetWNrWt33U1I0rzaNQyQf6ldwKCQ1LULquNZVLuAPjuSZu7204FDgJOA1wG3AodXrGtcOXMfQdLOtrv24o2IlsmZ+whdC3ZJs9UsDP0NSVtJuljSo5IWSNqpdn01SXqxpJ3Vs4B6l5Xb8GMV0clwl7S6pPdJ+jtJrx+x75O16qrkRODzNFPd/gj4F9sbAHPLvs4of+A2Ldt7ATcAxwHXSjqwanF9JulxSY+Vj8clPQ5sM9Reu76YWCe7ZSR9lWZY09U0fWhX2D667OvaHarjTXP73L4ukPQT268o2z8CDrZ9Zwn8S22/qm6F/SPpeGBD4C9tP1ja/tP21lULi0nr5Jk7sIvtg21/CXgtsJ6k70lai+7dofqkpLeWM1NL2h9A0puAgb21eoqsVuYcAniWZh5zykpVnbrhz/ZRNCtznS7pqHL7fffOBMehZuW2j0rapHYto+lquK85tGF7WRnudy1wGbBeraIq+V/AR4D3AnsBu0l6hKZL5qiKddXwaeBySe+luQPx25IOLUPhLqhaWQXl+tNbysMrgLUrljOIrqYZFvnF2oWMpqvdMt8AvmH7ghHtRwIn2e7i3BnBc3O3/wnwcobnVznb9oVVC6tMzRqyO9k+v3YtMTmdDPcYn6TLbO9eu46oR9JM4CHbT5apkA+jWXbvJuBfbS+rWV9tkm6z/fLadYyns+FeZoO07QWSdgD2ppn1rVNnJpJGzl0vmrPWWwG6tJ7seCT9je2/rV1Hv6hZQ3UX209IOo5mCcazae7KxPZ7K5bXV2Wk0FBQDl2TW4fmblXbfvGoX1hZJ8Nd0qeAfWjedl9Mc1H1cmBP4ELbn6lYXl+V+dwfA/4e+DXNi/ffgTcAjFx9pqtGjiRqO0k32d6hbC+iWUR+aD6V6zJyaPBHDnVqBECPA4AdaRZjeADY0vZjZTrPq4DOhLvtt6tZLHwe8AXb50p6uouhPs74bQEv6mctA+AeSbvbvoxmJsStgLsGdWTIVLJ9lKSdaUYOnQ18hVVg5FBXR8sss/2M7SeAn9p+DMD2r2mGwHVKWaRkH+DNks6hZzRRxzwCbGf7xSM+1qeZDbBLjgT+WtIPaV4P10q6HLgEOLpqZRWsiiOHunrm/pSkdUq47zzUKGkDOhjuALZ/BRxdpkN+Xe16Kvk68DLgwVH2fbPPtVRl+x6aYbHb01yD+RrNyKEFXVpLtlf5uY+X9G1g4G/u62qf+1q2fzNK+6Y0S6v9pEJZETGAJK1JswLVfbYvkXQw8HvAzcA8209XLXAMnQz3iIjJknQaTS/HOjRdd+sB3wP2oMnQQ+tVN7aEe0TEOHpWYppGs9TgS20/U8b/Xzeow4W7ekE1ImKyVitdM+vTnL1vUNrXAgb2bvauXlCNCUi6uWyeYPsrVYuJgSFpPs3NOyfYvqF2PX1yMnALsDrwCZo5h+4AdgXOqFnYeNIt00PSJcDTNC/c79eup7Zygfm1ts+rXUtN+UM3TNJrgJk0d69+tHY9/SLppQC275O0Ic2wyLttX121sHEk3HuU/8AtgF1tn1C7nhgc5eadXbv+h66rSv/6LsCM0rQEuNoDHKDpcy8kbWL7PtuLuhTsZWm9MyT9u6SPS1qjZ9/ZFUsbCJLeDmD7Z10Kdo2//OKOtevrJ0lvBW4HjgHeVj4+Ddxe9g2kTva5SzqW5lb7hyXNBs4Eni3B9h7bV9StsK9OAb4LXAkcAVwh6Q9t/4zmhp7OkPQ/RzYBJ5RREtj+Xv+rquZE4FM0c6r8CPhz23tK2gM4iW7d6PZl4C227+xtlLQ1cD6wfY2iJtLJbpkRy6ldDvxVmR3y5cA3bc+uW2H/SLrW9o49j98NfAx4O/Dtji05+DRwIfAQw7P/HQB8h2b2vy7NhJjlFwtJtwPbj5zmuIygucn2tnUqG18nz9yBaZKmlf+sF9leAGD7trLUXpesIWlt208C2P6GpAdoQm7duqX13e8Bx9LcYn8SgKQ32z68bllVPFm6HDagLL9o++yOLr94CrBA0hnAPaVtK5q7Vk+uVtUEutrnfiJwvqTdgQskfVnSmyR9mma5vS75Ks2Ux8+xfQlwINCVoW4AlD/yewJrlv7mXVgFZv+bIuMtv/ihinX1ne3PAQfTvJt7XfkQ8K6ybyB1slsGQNJuNC/g5y2nBpwyqHNFRP+UkVNfAmbb/q3K5UQst86GezQkbWr74Z7H76YZ8nUDzXJqeYF0kKTXAjeXdQ5eBMxleJm9z9p+tGqBA0LSMbaPqV3HaDrZLaPGQZIOLNt7SDpe0vslde05uWhoQ9IngUOARTTdE/9Yq6gaJG0u6SRJJ0jaRNIxkq6XdKaaBaK75BSaO1GhGS2yAXBcaTu1VlEDaFHtAsbSyTN3SScCL6FZhOAxmjkizgX2BR603Zk+xRGjIq4B3mj7V2VY6DVDo4q6QNIFwHk0F5IPBk6jmcd9f5qhcPvVq66/JN1se/uyfU3vqKmRI6xiMHV1tMwbbb+iBNgDNHO4PyXpdOCayrX124sk7UTzLm71smgHtp+W1LVREZvZ/icASe+3fVxp/ydJR1Ssq4YbJB1u+1TgOkmzbS8sw4U7f01K0m22X167jvF0NdyXwXMBtsD2U+XxMkldW2Xmfoa7X34uaQvb95fb7ZeN83Vt1Nsl9/Vx9nXBkcCXS1fdw8D/l3QPzVDAI6tW1meSHmd41NTQ/Q/rDLXbfnGdysbX1XB/QNJ6tn9pe++hRkmbA09VrKvvbO82xq5HgN/vYymD4Jye18UnhxolbQvcVrGuvisXTA+T9GJga8qIMtujLUHYdqfS3Kn7l0M/v6T/tL111aom0Mk+97FIWhdY1/ZDtWvpN0lrjBwCOnIkTQTA0B/A2nX0k6Sdgf9NM1z6K8DiQR8i27W3ms8pIyM2L9vTy7wis7oW7JJ2k3QvcL+kiyTN6tl90Rhf1jmSOjMNwyTcVLuAfrO9iGaaX4ArgLUrljMpneyWkfQ+mnG7knQccBjNuO7PSfq87YG9pXgKfB7Yy/aNkg4ALpZ0iO0rGe5fDPhT4E9qF9Evko4eaxfNGqKdY/tZ4HhJ3wYGfm6dTnbLSPoJzS33LwLuAra1/YCkjYDLuzTMS9J1tl/V8/h3aRb//SjwN12aOCyGSXqSphtitIvqf257w/5WVI+kmcBDtp+UJJqTwaEbuv515IRig6KTZ+7A07afAJ6Q9FPbDwDY/oWkrv21e1rS5j3PwY1qpnX9PrBN3dL6T9IGwN48f1GGC20/Uq2oOq4Bzi7dEc8jqVOjZWim9d2lbB9L83txNrA78Bqa+XcGTlf73K3hRSn2HWqUtDbde07mApv1Nti+F3gTzQu5MyS9hybU3kyzEPI6wG7AorKvSw6neVc7ms5MiV2sVk4Goel3P8j2N8oU0DtXrGtcXe2WmQncN8r8zDNo5m2+pE5lUZOkW2nWjH1kRPtGwFWDftNKTA1JFwLH2b5M0neBo23fVe4Fuay3W3OQdO0sFQDbd4/WT2Z7SYJ9mKRjatfQZ2L0KX6fJReXnyNpTu0a+uxI4K8l/ZBmypJr1Szycwkw1oXn6rra5z4mSfNsd+3FO5aBnRRpinwGuEbSRQwvyjCTZhK1v6tW1eDp1B862/fQzGe/Pc0U4V+jmSJ8QRlBM5A62S0zHkk7j3YRKbqhdMHsxX+/oPqLelVFTZI00dTXkzmm3xLugaS9gC2BS3sXAZb0XtunVCusz1bVX+KpIGlj4IPAfTRLyX2cZgWim2nmc+/MHztJP6BZRP4c23f3tK8JvAE4lGYI9deqFDiGTva5S/qgpE3L9raSfijpEUlXSerMFLcAkj4LfAJ4BXCppD/r2f3BOlVVc7mkPysX3J8jaU1Ju0uaT/OL3AXfoJn6eGfgcmBzmvncf03TLdEle9OsG3u6pPsk3STpDuB24J3AlwYt2KGjZ+6SbrT9u2X7POCrts+S9GbgM7ZfX7O+fio3dO1UZsTckGb+8ltt/7m6t8r92jRjlt9FM1nWIzQ3uq1GMxXDibZ/XK3APhqas73ctHOv7Rkj99Wrrp4yhHpT4NeDfu9DVy+o9v7cL7F9FoDtH0hav1JNtUwbGjlk+xFJfwjMK7dYr1m3tP6y/STNAtAnrkq/xFNktXL9YX1gPUmzbN9Zhv916nXRq0yud3/tOiajk90ywHckfU3SbwFnSfqwpJdJOhy4e6IvbpmfSnrT0APbz9g+ArgV2L5eWXXZftr2/R0NdoDPAbcAC2jezXxV0iXA9TQLh8eA62S3DICkw2gmg9qGZpm9e2huKT7OHVr8V83ix9j+9Sj7Zthe0v+qYhBIWp0mI5ZJmgbsCCyxvUqcuXZdZ8M9hg1NfVwmT5sOvJGm3/3GupXFIJH0Wdsfr11HTE5X+9zHJGlP2xfXrqNfMv1xjEbS8SObgEMkrQdg+6j+VxXLI2fuI0i62/bMiY9sh0x/HKNRs17qFTSjhIbuSP0C8BcAtudXKi0mqZNn7pLOHWsXsEk/axkAmf44RrMDzZQLewN/Yfs+SZ9KqK86OhnuNH3K7wZGrgMphudt7gpreP3Urk9/HIXtx4EPq1k79LRyP0heD6uQrob7lcATtq8YuaNM+9olf0SZCbHM4z5kE+AjVSqKgWF7kaTdgfcD/1G7npi89Ll3XOZTidHkdbHq6+TbrHJL9Qof0xKZTyVGk9fFKq6TZ+6r6ixvU2GM+VTWBlanY/OpxLDMs7Pq62q4J9BGkflUYjR5XayaOhnuvfLCjYg26ny4R0S0UScvqEZEtF3CPSKihRLuEREtlHCPiGih/wLTFI2LOr7qQAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "train1['AgeBin'].value_counts().plot(kind='bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.520883Z",
     "start_time": "2020-08-05T13:41:42.515786Z"
    }
   },
   "outputs": [],
   "source": [
    "train3 = train1.drop(['Age','Fare'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.545781Z",
     "start_time": "2020-08-05T13:41:42.522776Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "      <th>FareBin</th>\n",
       "      <th>AgeBin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(-0.001, 7.854]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass  SibSp  Parch  Sex_female  Sex_male  Embarked_C  \\\n",
       "0         0       3      1      0           0         1           0   \n",
       "1         1       1      1      0           1         0           1   \n",
       "2         1       3      0      0           1         0           0   \n",
       "3         1       1      1      0           1         0           0   \n",
       "4         0       3      0      0           0         1           0   \n",
       "\n",
       "   Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  New_name_Miss  \\\n",
       "0           0           1                0              0              0   \n",
       "1           0           0                0              0              0   \n",
       "2           0           1                0              0              1   \n",
       "3           0           1                0              0              0   \n",
       "4           0           1                0              0              0   \n",
       "\n",
       "   New_name_Mr  New_name_Mrs            FareBin            AgeBin  \n",
       "0            1             0    (-0.001, 7.854]  (16.336, 32.252]  \n",
       "1            0             1  (39.688, 512.329]  (32.252, 48.168]  \n",
       "2            0             0      (7.854, 10.5]  (16.336, 32.252]  \n",
       "3            0             1  (39.688, 512.329]  (32.252, 48.168]  \n",
       "4            1             0      (7.854, 10.5]  (32.252, 48.168]  "
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.565399Z",
     "start_time": "2020-08-05T13:41:42.547665Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AgeBin</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>(0.34, 16.336]</th>\n",
       "      <td>0.550000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(16.336, 32.252]</th>\n",
       "      <td>0.344168</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(32.252, 48.168]</th>\n",
       "      <td>0.404255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(48.168, 64.084]</th>\n",
       "      <td>0.434783</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(64.084, 80.0]</th>\n",
       "      <td>0.090909</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  Survived\n",
       "AgeBin                    \n",
       "(0.34, 16.336]    0.550000\n",
       "(16.336, 32.252]  0.344168\n",
       "(32.252, 48.168]  0.404255\n",
       "(48.168, 64.084]  0.434783\n",
       "(64.084, 80.0]    0.090909"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train3[['Survived', 'AgeBin']].groupby('AgeBin').mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.589397Z",
     "start_time": "2020-08-05T13:41:42.571273Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FareBin</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>(-0.001, 7.854]</th>\n",
       "      <td>0.217877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(7.854, 10.5]</th>\n",
       "      <td>0.201087</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(10.5, 21.679]</th>\n",
       "      <td>0.424419</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(21.679, 39.688]</th>\n",
       "      <td>0.444444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(39.688, 512.329]</th>\n",
       "      <td>0.642045</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   Survived\n",
       "FareBin                    \n",
       "(-0.001, 7.854]    0.217877\n",
       "(7.854, 10.5]      0.201087\n",
       "(10.5, 21.679]     0.424419\n",
       "(21.679, 39.688]   0.444444\n",
       "(39.688, 512.329]  0.642045"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train3[['Survived', 'FareBin']].groupby('FareBin').mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.617998Z",
     "start_time": "2020-08-05T13:41:42.591402Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "      <th>FareBin</th>\n",
       "      <th>AgeBin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(-0.001, 7.854]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \\\n",
       "0         0       3  22.0      1      0   7.2500           0         1   \n",
       "1         1       1  38.0      1      0  71.2833           1         0   \n",
       "2         1       3  26.0      0      0   7.9250           1         0   \n",
       "3         1       1  35.0      1      0  53.1000           1         0   \n",
       "4         0       3  35.0      0      0   8.0500           0         1   \n",
       "\n",
       "   Embarked_C  Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  \\\n",
       "0           0           0           1                0              0   \n",
       "1           1           0           0                0              0   \n",
       "2           0           0           1                0              0   \n",
       "3           0           0           1                0              0   \n",
       "4           0           0           1                0              0   \n",
       "\n",
       "   New_name_Miss  New_name_Mr  New_name_Mrs            FareBin  \\\n",
       "0              0            1             0    (-0.001, 7.854]   \n",
       "1              0            0             1  (39.688, 512.329]   \n",
       "2              1            0             0      (7.854, 10.5]   \n",
       "3              0            0             1  (39.688, 512.329]   \n",
       "4              0            1             0      (7.854, 10.5]   \n",
       "\n",
       "             AgeBin  \n",
       "0  (16.336, 32.252]  \n",
       "1  (32.252, 48.168]  \n",
       "2  (16.336, 32.252]  \n",
       "3  (32.252, 48.168]  \n",
       "4  (32.252, 48.168]  "
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.625904Z",
     "start_time": "2020-08-05T13:41:42.620387Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dtype('float64')"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train1.Age.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.639340Z",
     "start_time": "2020-08-05T13:41:42.628056Z"
    }
   },
   "outputs": [],
   "source": [
    "def func_age(x):\n",
    "    if x > 64:\n",
    "        x = 0\n",
    "    elif 16 < x <= 32:\n",
    "        x = 1\n",
    "    elif 32 < x <= 48:\n",
    "        x = 2\n",
    "    elif 48 < x <= 64:\n",
    "        x = 3\n",
    "    else:\n",
    "        x = 5\n",
    "    return x # 注意这里要return x\n",
    "\n",
    "train3['Age_code'] = train1.Age.apply(func_age)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.650788Z",
     "start_time": "2020-08-05T13:41:42.641529Z"
    }
   },
   "outputs": [],
   "source": [
    "def func_fare(x):\n",
    "    if 7 < x <= 10:\n",
    "        x = 0\n",
    "    elif x <= 7:\n",
    "        x = 1\n",
    "    elif 10 < x <= 21:\n",
    "        x = 2\n",
    "    elif 21 < x <= 39:\n",
    "        x = 3\n",
    "    else:\n",
    "        x = 4\n",
    "    return x\n",
    "\n",
    "train3['Fare_code'] = train1.Fare.apply(func_fare)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.669131Z",
     "start_time": "2020-08-05T13:41:42.653490Z"
    }
   },
   "outputs": [],
   "source": [
    "coder = LabelEncoder()\n",
    "\n",
    "train3['Fare_labelcode'] = coder.fit_transform(train3.FareBin)\n",
    "train3['Age_labelcode'] = coder.fit_transform(train3.AgeBin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.687962Z",
     "start_time": "2020-08-05T13:41:42.671176Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Fare_labelcode</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.217877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.201087</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.424419</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.444444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.642045</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                Survived\n",
       "Fare_labelcode          \n",
       "0               0.217877\n",
       "1               0.201087\n",
       "2               0.424419\n",
       "3               0.444444\n",
       "4               0.642045"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train3[['Survived', 'Fare_labelcode']].groupby('Fare_labelcode').mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "由此知道，LabelEncoder就是按顺序进行编号。我们先跑准确率。再测试按程度编号效果怎么样"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型测试"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "我们目前只是将年龄和票价离散化处理了。等下尝试创建一些新的特征。先测试下模型效果怎么样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.707671Z",
     "start_time": "2020-08-05T13:41:42.690120Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "      <th>FareBin</th>\n",
       "      <th>AgeBin</th>\n",
       "      <th>Age_code</th>\n",
       "      <th>Fare_code</th>\n",
       "      <th>Fare_labelcode</th>\n",
       "      <th>Age_labelcode</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(-0.001, 7.854]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass  SibSp  Parch  Sex_female  Sex_male  Embarked_C  \\\n",
       "0         0       3      1      0           0         1           0   \n",
       "1         1       1      1      0           1         0           1   \n",
       "2         1       3      0      0           1         0           0   \n",
       "3         1       1      1      0           1         0           0   \n",
       "4         0       3      0      0           0         1           0   \n",
       "\n",
       "   Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  New_name_Miss  \\\n",
       "0           0           1                0              0              0   \n",
       "1           0           0                0              0              0   \n",
       "2           0           1                0              0              1   \n",
       "3           0           1                0              0              0   \n",
       "4           0           1                0              0              0   \n",
       "\n",
       "   New_name_Mr  New_name_Mrs            FareBin            AgeBin  Age_code  \\\n",
       "0            1             0    (-0.001, 7.854]  (16.336, 32.252]         1   \n",
       "1            0             1  (39.688, 512.329]  (32.252, 48.168]         2   \n",
       "2            0             0      (7.854, 10.5]  (16.336, 32.252]         1   \n",
       "3            0             1  (39.688, 512.329]  (32.252, 48.168]         2   \n",
       "4            1             0      (7.854, 10.5]  (32.252, 48.168]         2   \n",
       "\n",
       "   Fare_code  Fare_labelcode  Age_labelcode  \n",
       "0          0               0              1  \n",
       "1          4               4              2  \n",
       "2          0               1              1  \n",
       "3          4               4              2  \n",
       "4          0               1              2  "
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:42.716818Z",
     "start_time": "2020-08-05T13:41:42.708954Z"
    }
   },
   "outputs": [],
   "source": [
    "x3, y3 = train3.drop(['Survived', 'AgeBin', 'FareBin'], axis=1), train3['Survived']\n",
    "# 这里要剔除掉AgeBin这些是因为这两个都不是category类。运行下面程序会报错"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:43.771241Z",
     "start_time": "2020-08-05T13:41:42.718532Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>    0.834081\n",
      "<class 'sklearn.ensemble._forest.RandomForestClassifier'>        0.820628\n",
      "<class 'sklearn.svm._classes.SVC'>                               0.811659\n",
      "<class 'sklearn.naive_bayes.GaussianNB'>                         0.784753\n",
      "<class 'sklearn.tree._classes.DecisionTreeClassifier'>           0.780269\n",
      "<class 'sklearn.naive_bayes.BernoulliNB'>                        0.775785\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "def modeling(model_list, x, y):\n",
    "    d = {}\n",
    "    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2020)\n",
    "    for model in model_list:\n",
    "        model.fit(x_train, y_train)\n",
    "        score = model.score(x_test, y_test)\n",
    "#         print(f'{model.__class__} : {score}')\n",
    "        d[model.__class__] = score # 获得属性的名字\n",
    "    print(pd.Series(d).sort_values(ascending=False))\n",
    "\n",
    "\n",
    "modeling(MODEL, x3, y3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "****\n",
    "这里和之前没有离散化年龄和票价的情况下进行对比\n",
    "\n",
    "<class 'sklearn.svm._classes.SVC'>                               0.834081\n",
    "\n",
    "<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>    0.807175\n",
    "\n",
    "<class 'sklearn.ensemble._forest.RandomForestClassifier'>        0.807175\n",
    "\n",
    "<class 'sklearn.naive_bayes.GaussianNB'>                         0.798206\n",
    "\n",
    "<class 'sklearn.naive_bayes.BernoulliNB'>                        0.766816\n",
    "\n",
    "<class 'sklearn.tree._classes.DecisionTreeClassifier'>           0.744395"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:43.788675Z",
     "start_time": "2020-08-05T13:41:43.775488Z"
    }
   },
   "outputs": [],
   "source": [
    "model_param_dict = {\n",
    "# 'LogisticRegressionCV'\n",
    "    0:{\n",
    "    'penalty':['l1','l2','elasticnet'],\n",
    "    'solver':['newton-cg', 'lbfgs','liblinear','sag','saga'],\n",
    "    'class_weight':['balance',None],\n",
    "#     'cv':[2,3,4],\n",
    "},\n",
    "\n",
    "# 'RandomForestClassifier'\n",
    "    1:{\n",
    "    'n_estimators':[90,100,110],\n",
    "    'criterion':['gini','entropy'],\n",
    "    'max_depth':[i for i in range(15)],\n",
    "    'min_samples_split':[i for i in range(5)],\n",
    "    'class_weight':['balanced', 'balanced_subsample'],\n",
    "},\n",
    "\n",
    "# GaussianNB_param = {}\n",
    "\n",
    "# BernoulliNB_param = {   }\n",
    "\n",
    "# 'DecisionTreeClassifier'\n",
    "    2:{\n",
    "    'criterion':['gini','entropy'],\n",
    "    'splitter':['best','random'],\n",
    "    'max_depth':[i for i in range(5,15)],\n",
    "    'min_samples_split':[i for i in range(5)],\n",
    "#     'minz_samples_leaf':[i for i in range(5)],\n",
    "    'max_features':['auto', 'sqrt', 'log2'],\n",
    "    'class_weight':['balanced',None],\n",
    "    \n",
    "},\n",
    "    3:{\n",
    "    'C':np.linspace(0,2,20),\n",
    "    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],\n",
    "    'degree':[i for i in range(5)],\n",
    "    'gamma':['scale', 'auto'],\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:41:44.115457Z",
     "start_time": "2020-08-05T13:41:43.793649Z"
    }
   },
   "outputs": [],
   "source": [
    "x3_train, x3_test, y3_train, y3_test = train_test_split(x3, y3, test_size=.3, random_state=2020)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:44:09.313045Z",
     "start_time": "2020-08-05T13:41:44.117452Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LogisticRegressionCV()\n",
      "{'class_weight': 'balance', 'penalty': 'l1', 'solver': 'liblinear'}\n",
      "0.835820895522388\n",
      "--------------------\n",
      "RandomForestClassifier()\n",
      "{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 7, 'min_samples_split': 4, 'n_estimators': 90}\n",
      "0.8022388059701493\n",
      "--------------------\n",
      "DecisionTreeClassifier()\n",
      "{'class_weight': None, 'criterion': 'entropy', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 3, 'splitter': 'best'}\n",
      "0.7798507462686567\n",
      "--------------------\n",
      "SVC()\n",
      "{'C': 0.21052631578947367, 'degree': 0, 'gamma': 'scale', 'kernel': 'linear'}\n",
      "0.8246268656716418\n",
      "--------------------\n"
     ]
    }
   ],
   "source": [
    "model_list = [\n",
    "    LogisticRegressionCV(),\n",
    "    RandomForestClassifier(), \n",
    "    DecisionTreeClassifier(),\n",
    "    SVC(),\n",
    "]\n",
    "\n",
    "for num, model in enumerate(model_list):\n",
    "    gscv = GridSearchCV(model, param_grid=model_param_dict[num], cv=2, n_jobs=-1)\n",
    "    gscv.fit(x3_train,y3_train)\n",
    "    score = gscv.score(x3_test,y3_test)\n",
    "    print(model)\n",
    "    print(gscv.best_params_)\n",
    "    print(score)\n",
    "    print('-'*20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "LogisticRegressionCV() 0.8251121076233184\n",
    "\n",
    "RandomForestClassifier() 0.820627802690583\n",
    "\n",
    "DecisionTreeClassifier() 0.7802690582959642\n",
    "\n",
    "<class 'sklearn.svm._classes.SVC'> 0.834081"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "看来还是之前的svc效果比较好，这里的svc是没有离散化的情况下"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "****\n",
    "**上面的情况是使用label encoder的效果。接下来是我自己使用分类的效果。附带了程度。即生存率越高。则数值越大**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:44:09.337836Z",
     "start_time": "2020-08-05T13:44:09.315274Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "      <th>FareBin</th>\n",
       "      <th>AgeBin</th>\n",
       "      <th>Age_code</th>\n",
       "      <th>Fare_code</th>\n",
       "      <th>Fare_labelcode</th>\n",
       "      <th>Age_labelcode</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(-0.001, 7.854]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass  SibSp  Parch  Sex_female  Sex_male  Embarked_C  \\\n",
       "0         0       3      1      0           0         1           0   \n",
       "1         1       1      1      0           1         0           1   \n",
       "2         1       3      0      0           1         0           0   \n",
       "3         1       1      1      0           1         0           0   \n",
       "4         0       3      0      0           0         1           0   \n",
       "\n",
       "   Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  New_name_Miss  \\\n",
       "0           0           1                0              0              0   \n",
       "1           0           0                0              0              0   \n",
       "2           0           1                0              0              1   \n",
       "3           0           1                0              0              0   \n",
       "4           0           1                0              0              0   \n",
       "\n",
       "   New_name_Mr  New_name_Mrs            FareBin            AgeBin  Age_code  \\\n",
       "0            1             0    (-0.001, 7.854]  (16.336, 32.252]         1   \n",
       "1            0             1  (39.688, 512.329]  (32.252, 48.168]         2   \n",
       "2            0             0      (7.854, 10.5]  (16.336, 32.252]         1   \n",
       "3            0             1  (39.688, 512.329]  (32.252, 48.168]         2   \n",
       "4            1             0      (7.854, 10.5]  (32.252, 48.168]         2   \n",
       "\n",
       "   Fare_code  Fare_labelcode  Age_labelcode  \n",
       "0          0               0              1  \n",
       "1          4               4              2  \n",
       "2          0               1              1  \n",
       "3          4               4              2  \n",
       "4          0               1              2  "
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:44:09.638512Z",
     "start_time": "2020-08-05T13:44:09.339426Z"
    }
   },
   "outputs": [],
   "source": [
    "x3, y3 = train3.drop(['Survived', 'AgeBin', 'FareBin'], axis=1), train3.Survived"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:44:09.685127Z",
     "start_time": "2020-08-05T13:44:09.641014Z"
    }
   },
   "outputs": [],
   "source": [
    "x3_train, x3_test, y3_train, y3_test = train_test_split(x3, y3, test_size=.3, random_state=2020)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:46:31.632319Z",
     "start_time": "2020-08-05T13:44:09.687449Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LogisticRegressionCV()\n",
      "{'class_weight': 'balance', 'penalty': 'l1', 'solver': 'liblinear'}\n",
      "0.835820895522388\n",
      "--------------------\n",
      "RandomForestClassifier()\n",
      "{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 4, 'min_samples_split': 4, 'n_estimators': 110}\n",
      "0.8134328358208955\n",
      "--------------------\n",
      "DecisionTreeClassifier()\n",
      "{'class_weight': None, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_split': 4, 'splitter': 'random'}\n",
      "0.8022388059701493\n",
      "--------------------\n",
      "SVC()\n",
      "{'C': 0.21052631578947367, 'degree': 0, 'gamma': 'scale', 'kernel': 'linear'}\n",
      "0.8246268656716418\n",
      "--------------------\n"
     ]
    }
   ],
   "source": [
    "model_list = [\n",
    "    LogisticRegressionCV(),\n",
    "    RandomForestClassifier(), \n",
    "    DecisionTreeClassifier(),\n",
    "    SVC(),\n",
    "]\n",
    "\n",
    "for num, model in enumerate(model_list):\n",
    "    gscv = GridSearchCV(model, param_grid=model_param_dict[num], cv=2, n_jobs=-1)\n",
    "    gscv.fit(x3_train,y3_train)\n",
    "    score = gscv.score(x3_test,y3_test)\n",
    "    print(model)\n",
    "    print(gscv.best_params_)\n",
    "    print(score)\n",
    "    print('-'*20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 中期总结"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**前面船票和年龄这两类数据都是连续的，并没有对数据进行归一化处理，归一化处理后：svm的准确度达到了最高0.839**\n",
    "\n",
    "**将数据进行离散化处理后，svm的效果反而下降了，但是其他模型的准确度却有所提升**\n",
    "\n",
    "**尝试了使用Labelencoder进行定距等级。这个模块是按顺序的。没有体现不喜欢是0。所以自己尝试了按定序等级程度编号，效果提升不明显**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 创建新的特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:46:31.639079Z",
     "start_time": "2020-08-05T13:46:31.634148Z"
    }
   },
   "outputs": [],
   "source": [
    "train3.drop(['AgeBin', 'FareBin'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:46:31.659303Z",
     "start_time": "2020-08-05T13:46:31.640636Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "      <th>Age_code</th>\n",
       "      <th>Fare_code</th>\n",
       "      <th>Fare_labelcode</th>\n",
       "      <th>Age_labelcode</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass  SibSp  Parch  Sex_female  Sex_male  Embarked_C  \\\n",
       "0         0       3      1      0           0         1           0   \n",
       "1         1       1      1      0           1         0           1   \n",
       "2         1       3      0      0           1         0           0   \n",
       "3         1       1      1      0           1         0           0   \n",
       "4         0       3      0      0           0         1           0   \n",
       "\n",
       "   Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  New_name_Miss  \\\n",
       "0           0           1                0              0              0   \n",
       "1           0           0                0              0              0   \n",
       "2           0           1                0              0              1   \n",
       "3           0           1                0              0              0   \n",
       "4           0           1                0              0              0   \n",
       "\n",
       "   New_name_Mr  New_name_Mrs  Age_code  Fare_code  Fare_labelcode  \\\n",
       "0            1             0         1          0               0   \n",
       "1            0             1         2          4               4   \n",
       "2            0             0         1          0               1   \n",
       "3            0             1         2          4               4   \n",
       "4            1             0         2          0               1   \n",
       "\n",
       "   Age_labelcode  \n",
       "0              1  \n",
       "1              2  \n",
       "2              1  \n",
       "3              2  \n",
       "4              2  "
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train3.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TODO\n",
    "\n",
    "- 接下来创建一些新的特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T13:46:31.685447Z",
     "start_time": "2020-08-05T13:46:31.661421Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "      <th>FareBin</th>\n",
       "      <th>AgeBin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(-0.001, 7.854]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \\\n",
       "0         0       3  22.0      1      0   7.2500           0         1   \n",
       "1         1       1  38.0      1      0  71.2833           1         0   \n",
       "2         1       3  26.0      0      0   7.9250           1         0   \n",
       "3         1       1  35.0      1      0  53.1000           1         0   \n",
       "4         0       3  35.0      0      0   8.0500           0         1   \n",
       "\n",
       "   Embarked_C  Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  \\\n",
       "0           0           0           1                0              0   \n",
       "1           1           0           0                0              0   \n",
       "2           0           0           1                0              0   \n",
       "3           0           0           1                0              0   \n",
       "4           0           0           1                0              0   \n",
       "\n",
       "   New_name_Miss  New_name_Mr  New_name_Mrs            FareBin  \\\n",
       "0              0            1             0    (-0.001, 7.854]   \n",
       "1              0            0             1  (39.688, 512.329]   \n",
       "2              1            0             0      (7.854, 10.5]   \n",
       "3              0            0             1  (39.688, 512.329]   \n",
       "4              0            1             0      (7.854, 10.5]   \n",
       "\n",
       "             AgeBin  \n",
       "0  (16.336, 32.252]  \n",
       "1  (32.252, 48.168]  \n",
       "2  (16.336, 32.252]  \n",
       "3  (32.252, 48.168]  \n",
       "4  (32.252, 48.168]  "
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "****\n",
    "\n",
    "创建两个新的特征：\n",
    "1. 家庭大小\n",
    "2. 是否是一个人"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = train1['SibSp'] + train1['Parch']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     537\n",
       "1     161\n",
       "2     102\n",
       "3      29\n",
       "5      22\n",
       "4      15\n",
       "6      12\n",
       "10      7\n",
       "7       6\n",
       "dtype: int64"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "del train4\n",
    "train4 = train1.copy()\n",
    "train4['Family_size'] = temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 如果是一个人则是1 否则就是0\n",
    "train4['IsAlone'] = train4['Family_size'].apply(lambda x: 0 if x else 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>New_name_Master</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "      <th>FareBin</th>\n",
       "      <th>AgeBin</th>\n",
       "      <th>Family_size</th>\n",
       "      <th>IsAlone</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(-0.001, 7.854]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \\\n",
       "0         0       3  22.0      1      0   7.2500           0         1   \n",
       "1         1       1  38.0      1      0  71.2833           1         0   \n",
       "2         1       3  26.0      0      0   7.9250           1         0   \n",
       "3         1       1  35.0      1      0  53.1000           1         0   \n",
       "4         0       3  35.0      0      0   8.0500           0         1   \n",
       "\n",
       "   Embarked_C  Embarked_Q  Embarked_S  New_name_Master  New_name_Misc  \\\n",
       "0           0           0           1                0              0   \n",
       "1           1           0           0                0              0   \n",
       "2           0           0           1                0              0   \n",
       "3           0           0           1                0              0   \n",
       "4           0           0           1                0              0   \n",
       "\n",
       "   New_name_Miss  New_name_Mr  New_name_Mrs            FareBin  \\\n",
       "0              0            1             0    (-0.001, 7.854]   \n",
       "1              0            0             1  (39.688, 512.329]   \n",
       "2              1            0             0      (7.854, 10.5]   \n",
       "3              0            0             1  (39.688, 512.329]   \n",
       "4              0            1             0      (7.854, 10.5]   \n",
       "\n",
       "             AgeBin  Family_size  IsAlone  \n",
       "0  (16.336, 32.252]            1        0  \n",
       "1  (32.252, 48.168]            1        0  \n",
       "2  (16.336, 32.252]            0        1  \n",
       "3  (32.252, 48.168]            1        0  \n",
       "4  (32.252, 48.168]            0        1  "
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train4.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del x4,y4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "x4, y4 = train4.drop(['Survived', 'FareBin', 'AgeBin'], axis=1), train4['Survived']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "没有归一化处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'sklearn.ensemble._forest.RandomForestClassifier'>        0.807175\n",
      "<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>    0.798206\n",
      "<class 'sklearn.naive_bayes.GaussianNB'>                         0.789238\n",
      "<class 'sklearn.naive_bayes.BernoulliNB'>                        0.775785\n",
      "<class 'sklearn.tree._classes.DecisionTreeClassifier'>           0.744395\n",
      "<class 'sklearn.svm._classes.SVC'>                               0.627803\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "def modeling(model_list, x, y):\n",
    "    d = {}\n",
    "    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2020)\n",
    "    for model in model_list:\n",
    "        model.fit(x_train, y_train)\n",
    "        score = model.score(x_test, y_test)\n",
    "#         print(f'{model.__class__} : {score}')\n",
    "        d[model.__class__] = score # 获得属性的名字\n",
    "    print(pd.Series(d).sort_values(ascending=False))\n",
    "\n",
    "modeling(MODEL, x4, y4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "归一化处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "min_max = MinMaxScaler()\n",
    "temp = pd.DataFrame(min_max.fit_transform(train4[['Age','Fare']]),columns=['AgeScaler','FareScaler'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [],
   "source": [
    "train4 = pd.concat([train4, temp], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>...</th>\n",
       "      <th>New_name_Misc</th>\n",
       "      <th>New_name_Miss</th>\n",
       "      <th>New_name_Mr</th>\n",
       "      <th>New_name_Mrs</th>\n",
       "      <th>FareBin</th>\n",
       "      <th>AgeBin</th>\n",
       "      <th>Family_size</th>\n",
       "      <th>IsAlone</th>\n",
       "      <th>AgeScaler</th>\n",
       "      <th>FareScaler</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(-0.001, 7.854]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.271174</td>\n",
       "      <td>0.014151</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.472229</td>\n",
       "      <td>0.139136</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(16.336, 32.252]</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.321438</td>\n",
       "      <td>0.015469</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(39.688, 512.329]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.434531</td>\n",
       "      <td>0.103644</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>(7.854, 10.5]</td>\n",
       "      <td>(32.252, 48.168]</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.434531</td>\n",
       "      <td>0.015713</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \\\n",
       "0         0       3  22.0      1      0   7.2500           0         1   \n",
       "1         1       1  38.0      1      0  71.2833           1         0   \n",
       "2         1       3  26.0      0      0   7.9250           1         0   \n",
       "3         1       1  35.0      1      0  53.1000           1         0   \n",
       "4         0       3  35.0      0      0   8.0500           0         1   \n",
       "\n",
       "   Embarked_C  Embarked_Q  ...  New_name_Misc  New_name_Miss  New_name_Mr  \\\n",
       "0           0           0  ...              0              0            1   \n",
       "1           1           0  ...              0              0            0   \n",
       "2           0           0  ...              0              1            0   \n",
       "3           0           0  ...              0              0            0   \n",
       "4           0           0  ...              0              0            1   \n",
       "\n",
       "   New_name_Mrs            FareBin            AgeBin Family_size IsAlone  \\\n",
       "0             0    (-0.001, 7.854]  (16.336, 32.252]           1       0   \n",
       "1             1  (39.688, 512.329]  (32.252, 48.168]           1       0   \n",
       "2             0      (7.854, 10.5]  (16.336, 32.252]           0       1   \n",
       "3             1  (39.688, 512.329]  (32.252, 48.168]           1       0   \n",
       "4             0      (7.854, 10.5]  (32.252, 48.168]           0       1   \n",
       "\n",
       "   AgeScaler  FareScaler  \n",
       "0   0.271174    0.014151  \n",
       "1   0.472229    0.139136  \n",
       "2   0.321438    0.015469  \n",
       "3   0.434531    0.103644  \n",
       "4   0.434531    0.015713  \n",
       "\n",
       "[5 rows x 22 columns]"
      ]
     },
     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train4.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "del x4,y4\n",
    "x4, y4 = train4.drop(['Survived', 'FareBin', 'AgeBin','Age','Fare'], axis=1), train4['Survived']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'sklearn.svm._classes.SVC'>                               0.825112\n",
      "<class 'sklearn.ensemble._forest.RandomForestClassifier'>        0.816143\n",
      "<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>    0.802691\n",
      "<class 'sklearn.naive_bayes.GaussianNB'>                         0.789238\n",
      "<class 'sklearn.naive_bayes.BernoulliNB'>                        0.775785\n",
      "<class 'sklearn.tree._classes.DecisionTreeClassifier'>           0.753363\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "modeling(MODEL, x4, y4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_param_dict = {\n",
    "# 'LogisticRegressionCV'\n",
    "    0:{\n",
    "    'penalty':['l1','l2','elasticnet'],\n",
    "    'solver':['newton-cg', 'lbfgs','liblinear','sag','saga'],\n",
    "    'class_weight':['balance',None],\n",
    "#     'cv':[2,3,4],\n",
    "},\n",
    "\n",
    "# 'RandomForestClassifier'\n",
    "    1:{\n",
    "    'n_estimators':[90,100,110],\n",
    "    'criterion':['gini','entropy'],\n",
    "    'max_depth':[i for i in range(15)],\n",
    "    'min_samples_split':[i for i in range(5)],\n",
    "    'class_weight':['balanced', 'balanced_subsample'],\n",
    "},\n",
    "\n",
    "# GaussianNB_param = {}\n",
    "\n",
    "# BernoulliNB_param = {   }\n",
    "\n",
    "# 'DecisionTreeClassifier'\n",
    "    2:{\n",
    "    'criterion':['gini','entropy'],\n",
    "    'splitter':['best','random'],\n",
    "    'max_depth':[i for i in range(5,15)],\n",
    "    'min_samples_split':[i for i in range(5)],\n",
    "#     'minz_samples_leaf':[i for i in range(5)],\n",
    "    'max_features':['auto', 'sqrt', 'log2'],\n",
    "    'class_weight':['balanced',None],\n",
    "    \n",
    "},\n",
    "# svm\n",
    "    3:{\n",
    "    'C':np.linspace(0,2,20),\n",
    "    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],\n",
    "    'degree':[i for i in range(5)],\n",
    "    'gamma':['scale', 'auto'],\n",
    "}\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LogisticRegressionCV()\n",
      "{'class_weight': 'balance', 'penalty': 'l1', 'solver': 'liblinear'}\n",
      "0.8161434977578476\n",
      "--------------------\n",
      "RandomForestClassifier()\n",
      "{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 8, 'min_samples_split': 4, 'n_estimators': 90}\n",
      "0.8161434977578476\n",
      "--------------------\n",
      "DecisionTreeClassifier()\n",
      "{'class_weight': None, 'criterion': 'entropy', 'max_depth': 7, 'max_features': 'log2', 'min_samples_split': 4, 'splitter': 'random'}\n",
      "0.7847533632286996\n",
      "--------------------\n",
      "SVC()\n",
      "{'C': 2.0, 'degree': 0, 'gamma': 'auto', 'kernel': 'rbf'}\n",
      "0.8251121076233184\n",
      "--------------------\n"
     ]
    }
   ],
   "source": [
    "model_list = [\n",
    "    LogisticRegressionCV(),\n",
    "    RandomForestClassifier(), \n",
    "    DecisionTreeClassifier(),\n",
    "    SVC()\n",
    "]\n",
    "x_train, x_test, y_train, y_test = train_test_split(x4, y4, random_state=2020)\n",
    "for num, model in enumerate(model_list):\n",
    "    gscv = GridSearchCV(model, param_grid=model_param_dict[num], cv=2, n_jobs=-1)\n",
    "    gscv.fit(x_train,y_train)\n",
    "    score = gscv.score(x_test,y_test)\n",
    "    print(model)\n",
    "    print(gscv.best_params_)\n",
    "    print(score)\n",
    "    print('-'*20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 问题思考"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "其实在上面数据集的划分中，只是简单的使用了train_test_split。然而模型的预测好坏，对训练集的划分结果影响也是比较大的。所以在接下来的时间中，我们要尝试不同训练集划分的模型得分"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型融合"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "想到模型融合，其实常见的有两个算法已经帮我们实现了\n",
    "- 随机森林\n",
    "- Adaboost\n",
    "\n",
    "随机森林属于bagging方法。也就是有放回地抽样。对于分类问题采用投票方式得到结果。对于回归问题采用取平均数的方式\n",
    "\n",
    "Adaboost采用boost方式。第一次随机抽样，然后对模型进行训练和预测，预测错误的，很大概率会进入到下一轮的训练集中"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## bagging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = np.zeros(4, dtype='int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0])"
      ]
     },
     "execution_count": 183,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 2, 3])"
      ]
     },
     "execution_count": 184,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp = temp + np.arange(4)\n",
    "temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.        , 0.33333333, 0.66666667, 1.        ])"
      ]
     },
     "execution_count": 185,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp = temp/3\n",
    "temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 1])"
      ]
     },
     "execution_count": 186,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp.astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n00000\\n11110\\n10110\\n\\n21220\\n\\n10110\\n'"
      ]
     },
     "execution_count": 189,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# train 表示训练集\n",
    "# test 表示测试集\n",
    "# y表示训练集的标签，注意训练集特征和标签是分开的\n",
    "\"\"\"\n",
    "_model = RandomForestClassifier()\n",
    "_bags = 10\n",
    "seed = 1\n",
    "_bagged_prediction = np.zeros(_test.shape[0])\n",
    "for n in range(bags):\n",
    "    model.set_params(random_state=seed+n)\n",
    "    model.fit(train,y)\n",
    "    preds=model.predict(test)\n",
    "    bagged_prediction += preds # 只是对应的结果相加罢了\n",
    "\n",
    "bagged_prediction = bagged_prediction/_bags # 得到最后的预测结果求平均\n",
    "\"\"\"\n",
    "\n",
    "\"\"\"\n",
    "00000\n",
    "11110\n",
    "10110\n",
    "\n",
    "21220\n",
    "\n",
    "10110\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## blending"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "什么是blending呢？\n",
    "\n",
    "比如我们现在预测年龄。使用了两个模型进行预测 model1 model2\n",
    "\n",
    "参考资料 https://www.jianshu.com/p/8ea67f320e97"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "model1在年龄小于50 的时候，预测的准确度比较好。\n",
    "\n",
    "model2在年龄大于50的时候比较好。\n",
    "\n",
    "blending就是在最后的预测上，年龄小于50的时候使用model1的结果。年龄大于50的时候使用model2的结果\n",
    "\n",
    "就好比在预测年龄的10分类上。model1在1-5准确度高。model2在其余部分准确度高"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## stacking"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 参考资料"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "知乎 https://zhuanlan.zhihu.com/p/26890738\n",
    "\n",
    "kaggle https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python#Ensembling-&-Stacking-models\n",
    "\n",
    "https://www.6aiq.com/article/1536427413103"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**什么是stacking，假如现在有三个模型，训练集对模型训练后，再分别利用三个模型，对测试集进行预测，会得到三列输出。**\n",
    "\n",
    "我们再利用这三列输出，作为训练集放到另一个模型中进行训练，得到最后的结果\n",
    "\n",
    "参考简书  https://www.jianshu.com/p/ad6b078a89c7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 432,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# !pip install civisml-extensions\n",
    "# 这个模块似乎是集成好的stacking函数"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 处理test数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "接下来我将参考[kaggle](https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python)上的stacking教程，一步步运行看看stacking的原理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 278,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# train3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 239,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>892</td>\n",
       "      <td>3</td>\n",
       "      <td>Kelly, Mr. James</td>\n",
       "      <td>male</td>\n",
       "      <td>34.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>330911</td>\n",
       "      <td>7.8292</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>893</td>\n",
       "      <td>3</td>\n",
       "      <td>Wilkes, Mrs. James (Ellen Needs)</td>\n",
       "      <td>female</td>\n",
       "      <td>47.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>363272</td>\n",
       "      <td>7.0000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>894</td>\n",
       "      <td>2</td>\n",
       "      <td>Myles, Mr. Thomas Francis</td>\n",
       "      <td>male</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>240276</td>\n",
       "      <td>9.6875</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>895</td>\n",
       "      <td>3</td>\n",
       "      <td>Wirz, Mr. Albert</td>\n",
       "      <td>male</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>315154</td>\n",
       "      <td>8.6625</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>896</td>\n",
       "      <td>3</td>\n",
       "      <td>Hirvonen, Mrs. Alexander (Helga E Lindqvist)</td>\n",
       "      <td>female</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3101298</td>\n",
       "      <td>12.2875</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Pclass                                          Name     Sex  \\\n",
       "0          892       3                              Kelly, Mr. James    male   \n",
       "1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   \n",
       "2          894       2                     Myles, Mr. Thomas Francis    male   \n",
       "3          895       3                              Wirz, Mr. Albert    male   \n",
       "4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   \n",
       "\n",
       "    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  \n",
       "0  34.5      0      0   330911   7.8292   NaN        Q  \n",
       "1  47.0      1      0   363272   7.0000   NaN        S  \n",
       "2  62.0      0      0   240276   9.6875   NaN        Q  \n",
       "3  27.0      0      0   315154   8.6625   NaN        S  \n",
       "4  22.0      1      1  3101298  12.2875   NaN        S  "
      ]
     },
     "execution_count": 239,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head()\n",
    "# 这里没有对test数据进行处理，所以接下来对这部分数据进行处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 从name中抽取有用信息\n",
    "\n",
    "test['Title'] = test['Name'].str.extract(', (\\w+)\\.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 245,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = test['Title'].value_counts()\n",
    "test['Title']=test['Title'].apply(lambda x:'Misc' if temp[x] < 10 else x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId      0\n",
       "Pclass           0\n",
       "Name             0\n",
       "Sex              0\n",
       "Age              0\n",
       "SibSp            0\n",
       "Parch            0\n",
       "Ticket           0\n",
       "Fare             0\n",
       "Cabin          327\n",
       "Embarked         0\n",
       "Title            0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 254,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.isnull().sum()\n",
    "# 可以发现age和fare有缺失，cabin等下我们丢弃即可"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 253,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 填补缺失值，年龄的话先使用中位数进行填充\n",
    "test['Age'].fillna(test.Age.median(),inplace=True)\n",
    "test['Fare'].fillna(test.Fare.median(), inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "metadata": {},
   "outputs": [],
   "source": [
    "drop_list = ['PassengerId', 'Name', 'Cabin', 'Ticket']\n",
    "test.drop(drop_list, axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>34.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.8292</td>\n",
       "      <td>Q</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>47.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.0000</td>\n",
       "      <td>S</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9.6875</td>\n",
       "      <td>Q</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.6625</td>\n",
       "      <td>S</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>12.2875</td>\n",
       "      <td>S</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Pclass     Sex   Age  SibSp  Parch     Fare Embarked Title\n",
       "0       3    male  34.5      0      0   7.8292        Q    Mr\n",
       "1       3  female  47.0      1      0   7.0000        S   Mrs\n",
       "2       2    male  62.0      0      0   9.6875        Q    Mr\n",
       "3       3    male  27.0      0      0   8.6625        S    Mr\n",
       "4       3  female  22.0      1      1  12.2875        S   Mrs"
      ]
     },
     "execution_count": 256,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 接下来要用哑变量对定类数据进行处理\n",
    "dummies_list = ['Sex', 'Embarked', 'Title']\n",
    "test = pd.get_dummies(test, columns=dummies_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 272,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>Title_Master</th>\n",
       "      <th>Title_Misc</th>\n",
       "      <th>Title_Miss</th>\n",
       "      <th>Title_Mr</th>\n",
       "      <th>Title_Mrs</th>\n",
       "      <th>Age_bin</th>\n",
       "      <th>Fare_bin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>34.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.8292</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>47.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9.6875</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.6625</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>12.2875</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  Embarked_C  \\\n",
       "0       3  34.5      0      0   7.8292           0         1           0   \n",
       "1       3  47.0      1      0   7.0000           1         0           0   \n",
       "2       2  62.0      0      0   9.6875           0         1           0   \n",
       "3       3  27.0      0      0   8.6625           0         1           0   \n",
       "4       3  22.0      1      1  12.2875           1         0           0   \n",
       "\n",
       "   Embarked_Q  Embarked_S  Title_Master  Title_Misc  Title_Miss  Title_Mr  \\\n",
       "0           1           0             0           0           0         1   \n",
       "1           0           1             0           0           0         0   \n",
       "2           1           0             0           0           0         1   \n",
       "3           0           1             0           0           0         1   \n",
       "4           0           1             0           0           0         0   \n",
       "\n",
       "   Title_Mrs  Age_bin  Fare_bin  \n",
       "0          0        2         0  \n",
       "1          1        3         0  \n",
       "2          0        4         0  \n",
       "3          0        1         0  \n",
       "4          1        1         0  "
      ]
     },
     "execution_count": 272,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 267,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 接下来对连续数据离散化\n",
    "test['Age_bin'] = pd.cut(test.Age, 5)\n",
    "test['Fare_bin'] = pd.cut(test.Fare, 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 270,
   "metadata": {},
   "outputs": [],
   "source": [
    "coder = LabelEncoder()\n",
    "test['Age_bin'] = coder.fit_transform(test['Age_bin'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 271,
   "metadata": {},
   "outputs": [],
   "source": [
    "test['Fare_bin'] = coder.fit_transform(test['Fare_bin'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 274,
   "metadata": {},
   "outputs": [],
   "source": [
    "test3 = test.drop(['Age', 'Fare'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 277,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65155"
      ]
     },
     "execution_count": 277,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del test\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 发现之前为了测试label encode和 自己进行定序的区别，多创建了两个变量，现在进行删除\n",
    "train3.drop(['Age_code', 'Fare_code'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(891, 16)"
      ]
     },
     "execution_count": 286,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train3.shape\n",
    "# 现在对的上了，test只比train多了一个标签属性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 287,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(418, 15)"
      ]
     },
     "execution_count": 287,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test3.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 开始stacking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 435,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 435,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 365,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ntrain:891\n",
      "ntest:418\n"
     ]
    }
   ],
   "source": [
    "NFOLDS = 5\n",
    "\n",
    "ntrain = train3.shape[0]\n",
    "print(f'ntrain:{ntrain}')\n",
    "ntest = test3.shape[0]\n",
    "print(f'ntest:{ntest}')\n",
    "kf = KFold(n_splits=NFOLDS, shuffle=True,random_state=0) # 这里只是实例化，还有split函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 357,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 为什么要定义一个sklearn助手呢？相比于直接实例化，有什么方便的地方吗？\n",
    "class SklearnHelper:\n",
    "    def __init__(self, clf, seed=0, params=None): # 初始化的参数，要放在clf的前面\n",
    "        params['random_state'] = seed\n",
    "        self.clf = clf(**params)\n",
    "        \n",
    "    def fit(self, x, y):\n",
    "        return self.clf.fit(x, y) # 为什么这个有输出呢\n",
    "    \n",
    "    def predict(self, x):\n",
    "        return self.clf.predict(x) # predict 本来就是输出预测结果的\n",
    "    \n",
    "    def train(self, x_train, y_train):\n",
    "        self.clf.fit(x_train, y_train) # 只是进行模型的训练，所以不需要输出\n",
    "    \n",
    "    def feature_importances(self, x, y):\n",
    "        print(self.clf.fit(x,y).feature_importances_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 371,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(712,)\n",
      "(179,)\n",
      "(713,)\n",
      "(178,)\n",
      "(713,)\n",
      "(178,)\n",
      "(713,)\n",
      "(178,)\n",
      "(713,)\n",
      "(178,)\n"
     ]
    }
   ],
   "source": [
    "# 这里使用的5折，712个数据作为训练集，179个数据作为测试集\n",
    "# 对于每个模型，先使用712个数据进行模型训练，178就是891/5左右\n",
    "# 进过5轮之后，每个训练集都会被当作测试集，并且有个输出\n",
    "# 这里虽然传入的是DF，但是返回的是DF的序号\n",
    "for i,j in kf.split(x_train3):\n",
    "    print(i.shape)\n",
    "    print(j.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 443,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.],\n",
       "       [0.]])"
      ]
     },
     "execution_count": 443,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.zeros((2,1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 445,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0., 0.])"
      ]
     },
     "execution_count": 445,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.zeros((2,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 448,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.]])"
      ]
     },
     "execution_count": 448,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.zeros((2,3)).reshape(-1,1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 467,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 1., 1.],\n",
       "       [1., 1., 1.],\n",
       "       [0., 0., 0.],\n",
       "       [0., 0., 0.],\n",
       "       [0., 0., 0.],\n",
       "       [0., 0., 0.],\n",
       "       [0., 0., 0.],\n",
       "       [0., 0., 0.]])"
      ]
     },
     "execution_count": 467,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d = np.zeros((8,3))\n",
    "d[0,:] = 1\n",
    "d[1,:] = 1\n",
    "d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 468,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([0, 1, 3, 4, 5, 7]), array([2, 6]))\n",
      "(array([0, 2, 3, 4, 5, 6]), array([1, 7]))\n",
      "(array([1, 2, 4, 5, 6, 7]), array([0, 3]))\n",
      "(array([0, 1, 2, 3, 4, 6, 7]), array([5]))\n",
      "(array([0, 1, 2, 3, 5, 6, 7]), array([4]))\n"
     ]
    }
   ],
   "source": [
    "for i in kf.split(d):# kfold也可以对array类型的数据进行处理\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 454,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.66666667, 0.66666667, 0.66666667])"
      ]
     },
     "execution_count": 454,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d.mean(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 458,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.66666667],\n",
       "       [0.66666667],\n",
       "       [0.66666667]])"
      ]
     },
     "execution_count": 458,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d1 = np.zeros((3,))\n",
    "d1[:] = d.mean(axis=0) # 同样都是一维，两者可以直接赋值改变数值\n",
    "d1.reshape(-1,1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 372,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 要弄清楚这个函数是干什么的，我们必须先知道stacking的原理是什么\n",
    "# stacking，堆叠，也就是很多模型叠加起来，可以堆叠很多层，这里使用两层\n",
    "# 第一层的结果是，五个模型的预测结果作为特征，放到另一个模型中输出结果\n",
    "# 所以这个函数要求输入模型，和数据集，输出预测结果，结果用ndarray形式表示\n",
    "# 函数的接口使用的是array类型\n",
    "\n",
    "NFOLDS = 5\n",
    "\n",
    "def get_off(clf, x_train, y_train, x_test):\n",
    "    of_train = np.zeros((ntrain,)) # 891\n",
    "    # 891, 是一维列向量，区别于891x1，表示二维 参考 https://zhidao.baidu.com/question/1372293838947031219.html\n",
    "    of_test = np.zeros((ntest,)) # 418,\n",
    "    of_test_skf = np.empty((NFOLDS, ntest)) # 5,418的零矩阵\n",
    "    \n",
    "    for i, (train_index, test_index) in enumerate(kf.split(x_train)): \n",
    "        x_tr = x_train[train_index] # 712 取出训练集的特征\n",
    "        y_tr = y_train[train_index] # 712 取出训练集的标签\n",
    "        x_te = x_train[test_index] # 179 取出测试集部分，没有标签\n",
    "        \n",
    "        clf.train(x_tr, y_tr) # clf传入的是实例化后的模型，训练每折的数据（训练）\n",
    "        of_train[test_index] = clf.predict(x_te) # 每次预测训练集的172左右数据，5折过后会存储891的全部数据（预测）\n",
    "        \n",
    "        of_test_skf[i,:] = clf.predict(x_test)# 同时也对真正的测试集进行预测\n",
    "    \n",
    "    # axis=0表示对每一列取平均值，可能有小数点\n",
    "    of_test[:] = of_test_skf.mean(axis=0) # 相当于交叉验证测试集，每折都输出预测，最后取平均，进行blending\n",
    "    return of_train.reshape(-1,1), of_test.reshape(-1,1) # 不管是3x3多少，全部转成列向量如9x1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 349,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put in our parameters for said classifiers\n",
    "# Random Forest parameters\n",
    "rf_params = {\n",
    "    'n_jobs': -1,\n",
    "    'n_estimators': 500,\n",
    "    'warm_start': True, \n",
    "     #'max_features': 0.2,\n",
    "    'max_depth': 6,\n",
    "    'min_samples_leaf': 2,\n",
    "    'max_features' : 'sqrt',\n",
    "    'verbose': 0\n",
    "}\n",
    "\n",
    "# Extra Trees Parameters\n",
    "et_params = {\n",
    "    'n_jobs': -1,\n",
    "    'n_estimators':500,\n",
    "    #'max_features': 0.5,\n",
    "    'max_depth': 8,\n",
    "    'min_samples_leaf': 2,\n",
    "    'verbose': 0\n",
    "}\n",
    "\n",
    "# AdaBoost parameters\n",
    "ada_params = {\n",
    "    'n_estimators': 500,\n",
    "    'learning_rate' : 0.75\n",
    "}\n",
    "\n",
    "# Gradient Boosting parameters\n",
    "gb_params = {\n",
    "    'n_estimators': 500,\n",
    "     #'max_features': 0.2,\n",
    "    'max_depth': 5,\n",
    "    'min_samples_leaf': 2,\n",
    "    'verbose': 0\n",
    "}\n",
    "\n",
    "# Support Vector Classifier parameters \n",
    "svc_params = {\n",
    "    'kernel' : 'linear',\n",
    "    'C' : 0.025\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 350,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEED = 0\n",
    "\n",
    "rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)\n",
    "et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)\n",
    "ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)\n",
    "gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)\n",
    "svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 462,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ravel和flatten都是将数据扁平化处理的函数,将pandas的Series转换成array类型\n",
    "# 参照 https://blog.csdn.net/liuweiyuxiang/article/details/78220080\n",
    "# y_train[0] = 0\n",
    "# 两者的区别在于，ravel是浅拷贝，在y_train上修改，会影响到train3的值，但flatten不会\n",
    "y_train3 = train3['Survived'].ravel()# 一维array\n",
    "y_train3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 338,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取出标签，取出特征，准备输入到函数中，只不过都转换成了array类型\n",
    "train3 = train3.drop(['Survived'], axis=1)\n",
    "x_train3 = train3.values # 将DF数据转换成2维array数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 380,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_test = test3.values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**接下来要输出stacking的第一层，使用上面定义好的五个模型分类器**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 355,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(891, 15)"
      ]
     },
     "execution_count": 355,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train3.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 373,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Number of features of the model must match the input. Model n_features is 15 and input n_features is 17 ",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-373-8c45cfb1cc59>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# 这些输出将会作为下一层的新的特征\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mextra_tree_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mextra_tree_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_off\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0met\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_train3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-372-03ed403ec8b2>\u001b[0m in \u001b[0;36mget_off\u001b[0;34m(clf, x_train, y_train, x_test)\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m         \u001b[0mof_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtest_index\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_te\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m         \u001b[0mof_test_skf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m     \u001b[0mof_test\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mof_test_skf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-347-51b263ed74d0>\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# predict 本来就是输出预测结果的\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     12\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/jupyter env/lib64/python3.6/site-packages/sklearn/ensemble/_forest.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m    627\u001b[0m             \u001b[0mThe\u001b[0m \u001b[0mpredicted\u001b[0m \u001b[0mclasses\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    628\u001b[0m         \"\"\"\n\u001b[0;32m--> 629\u001b[0;31m         \u001b[0mproba\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_proba\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    630\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    631\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_outputs_\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/jupyter env/lib64/python3.6/site-packages/sklearn/ensemble/_forest.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m    671\u001b[0m         \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    672\u001b[0m         \u001b[0;31m# Check data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 673\u001b[0;31m         \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_X_predict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    674\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    675\u001b[0m         \u001b[0;31m# Assign chunk of trees to jobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/jupyter env/lib64/python3.6/site-packages/sklearn/ensemble/_forest.py\u001b[0m in \u001b[0;36m_validate_X_predict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m    419\u001b[0m         \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    420\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 421\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimators_\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_X_predict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheck_input\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    422\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    423\u001b[0m     \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/jupyter env/lib64/python3.6/site-packages/sklearn/tree/_classes.py\u001b[0m in \u001b[0;36m_validate_X_predict\u001b[0;34m(self, X, check_input)\u001b[0m\n\u001b[1;32m    397\u001b[0m                              \u001b[0;34m\"match the input. Model n_features is %s and \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    398\u001b[0m                              \u001b[0;34m\"input n_features is %s \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 399\u001b[0;31m                              % (self.n_features_, n_features))\n\u001b[0m\u001b[1;32m    400\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    401\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Number of features of the model must match the input. Model n_features is 15 and input n_features is 17 "
     ]
    }
   ],
   "source": [
    "# 这些输出将会作为下一层的新的特征\n",
    "\n",
    "extra_tree_train, extra_tree_test = get_off(et, x_train3, y_train3, x_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "报错原因是train 和 test的特征不匹配"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 377,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(891, 15)"
      ]
     },
     "execution_count": 377,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train3.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 381,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(418, 15)"
      ]
     },
     "execution_count": 381,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 388,
   "metadata": {},
   "outputs": [],
   "source": [
    "extra_tree_train, extra_tree_test = get_off(et, x_train3, y_train3, x_test)# Extra Trees\n",
    "rf_train, rf_test = get_off(rf,x_train3, y_train3, x_test) # Random Forest\n",
    "ada_train, ada_test = get_off(ada, x_train3, y_train3, x_test) # AdaBoost \n",
    "gb_train, gb_test = get_off(gb,x_train3, y_train3, x_test) # Gradient Boost\n",
    "svc_train, svc_test = get_off(svc,x_train3, y_train3, x_test) # Support Vector Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 469,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 本来以为训练集没用了，我以为直接使用4个模型的预测结果，放到第二层就OK了\n",
    "# 这里之所以还需要train 和 test，是因为我们train和test特征纬度都要一样\n",
    "# 测试集418，4 训练集891，4\n",
    "# 我们使用第二层网络对训练集进行训练，然后对测试集进行预测，直接输出结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 386,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(418, 1)\n",
      "(891, 1)\n"
     ]
    }
   ],
   "source": [
    "print(extra_tree_test.shape)\n",
    "print(extra_tree_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 395,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(891,)"
      ]
     },
     "execution_count": 395,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print(extra_tree_train)\n",
    "# 本来是列向量的，转换成了ndarray\n",
    "extra_tree_train.ravel().shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 399,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 开始训练第二层网络\n",
    "second_layer_train = pd.DataFrame({\n",
    "    'Exrea_Trees':extra_tree_train.ravel(),\n",
    "    'Random_Forest':rf_train.ravel(),\n",
    "    'AdaBoost':ada_train.ravel(),\n",
    "    'Gradient_Boost':gb_train.ravel(),\n",
    "#     'SVM':svc_train.ravel(),\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 400,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Exrea_Trees</th>\n",
       "      <th>Random_Forest</th>\n",
       "      <th>AdaBoost</th>\n",
       "      <th>Gradient_Boost</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Exrea_Trees  Random_Forest  AdaBoost  Gradient_Boost\n",
       "0          0.0            0.0       0.0             0.0\n",
       "1          1.0            1.0       1.0             1.0\n",
       "2          0.0            1.0       1.0             0.0\n",
       "3          1.0            1.0       1.0             1.0\n",
       "4          0.0            0.0       0.0             0.0"
      ]
     },
     "execution_count": 400,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "second_layer_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 模型的相关性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 470,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 模型融合，怎么知道哪些模型融合之后效果比较好\n",
    "# 可以将每个模型的预测结果组成为一个DF表\n",
    "# 然后查看这些特征的相关性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 403,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Exrea_Trees</th>\n",
       "      <th>Random_Forest</th>\n",
       "      <th>AdaBoost</th>\n",
       "      <th>Gradient_Boost</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Exrea_Trees</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.938193</td>\n",
       "      <td>0.878435</td>\n",
       "      <td>0.747343</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Random_Forest</th>\n",
       "      <td>0.938193</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.889175</td>\n",
       "      <td>0.750308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AdaBoost</th>\n",
       "      <td>0.878435</td>\n",
       "      <td>0.889175</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.718777</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Gradient_Boost</th>\n",
       "      <td>0.747343</td>\n",
       "      <td>0.750308</td>\n",
       "      <td>0.718777</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                Exrea_Trees  Random_Forest  AdaBoost  Gradient_Boost\n",
       "Exrea_Trees        1.000000       0.938193  0.878435        0.747343\n",
       "Random_Forest      0.938193       1.000000  0.889175        0.750308\n",
       "AdaBoost           0.878435       0.889175  1.000000        0.718777\n",
       "Gradient_Boost     0.747343       0.750308  0.718777        1.000000"
      ]
     },
     "execution_count": 403,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "second_layer_train.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 404,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:>"
      ]
     },
     "execution_count": 404,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaYAAAE/CAYAAAANGGeIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAoO0lEQVR4nO3de5wdVZnu8d+TCBK5K4hCIIAHFVQIdxHlqoCMAjogIKiAIzoj4EGZox75gAPHC95mRkQclKsioAJOnGFE5CLKNZCEQLgo4KgJKmC4CQhJ93P+qNWTTZvu3p00XdW7ni+f+nTtVbWr3u40++216q1Vsk1ERERTTKo7gIiIiE5JTBER0ShJTBER0ShJTBER0ShJTBER0ShJTBER0ShJTBERMSxJZ0l6UNIdQ2yXpK9KulfSXElbdWx7n6RfleV93ZwviSkiIkZyDrDXMNvfCmxSliOB0wEkvRg4Edge2A44UdKaI50siSkiIoZl+1pg4TC77Auc58qNwBqSXg7sCVxhe6HtR4ArGD7BAUlMERGx/NYDftfxen5pG6p9WC8Y09BiWIsevj/zPwGLLvpK3SE0xkNn3l13CI2x5/zH6g6hUe5+cKaW5/2j+bxZce1XfJBqCG7AGbbPWJ7zL48kpoiIXtTf1/WuJQktTyJaAKzf8XpqaVsA7DKo/ZqRDpahvIiIXuT+7pflNwN4b6nOez3wmO3fA5cDe0hasxQ97FHahpUeU0REL+ofk4QDgKQLqHo+a0maT1VptwKA7W8AlwF7A/cCTwGHl20LJZ0MzCyHOsn2cEUUQBJTRERPct/isTuWffAI2w18eIhtZwFnjeZ8SUwREb1obIboapHEFBHRi0ZR/NA0SUwREb0oPaaIiGiUMSx+GG9JTBERPcjpMUVERKOMYVXeeEtiiojoRSl+iIiIRslQXkRENEqKHyIiolHSY4qIiEZJjykiIprE/YvqDmGZJTFFRPSi9JgiIqJRco0pIiIaJfcxRUREo6THFBERjZIpiSIiolFS/BAREY2SxDQySX3A7R1NF9r+/Die/1JgI2AVYG3g12XTP9i+frziiIgYD3aKH7rxtO3py/JGSZO9nD9l2+8ox9oFOM722wad4wW2J+6gbEREpwncY5pU58klrS7pHkmvKq8vkPSBsv5nSV+WdBuwg6RDJd0saY6kf5M0uex3uqRbJM2T9E+jPP9hkmZIugq4UtLKks4q55ktad+y32RJX5Q0U9JcSR8s7S+XdG2J6Q5JbxrLn09ExDJzf/dLw4xnYppSPsAHlgNtPwYcBZwj6SBgTdvfLPuvDNxkewvgT8CBwI6l19UHHFL2+5TtbYDNgZ0lbT7KuLYC9re9M/Ap4Crb2wG7Al+UtDLwfuAx29sC2wIfkLQR8G7g8hLTFsCcwQeXdGRJnLd867wLRhlaRMQy6lvc/dIwtQ/l2b5C0gHAaVQf7gP6gIvL+u7A1sBMSQBTgAfLtndJOpLqe3k5sBkwdxRxXWF7YVnfA9hH0nHl9UrABqV9c0n7l/bVgU2AmcBZklYAfmh7zlK+vzOAMwAWPXy/RxFXRMSym8BDebVX5UmaBGwKPAWsCcwvm/7ScV1JwLm2PznovRsBxwHb2n5E0jlUyWQ0nuw8JPC3tu8ZdB4BR9u+fCnx7wT8DVWv7yu2zxvl+SMixl4Dh+i6Ves1puJY4C6qYbGzS+9jsCuB/SW9FEDSiyVNA1ajSiyPSVoHeOtyxnI5cHRJREjasqP97wdik/TKcj1qGvDHMvz4LaphwYiI+vX3d780zHj2mKZImtPx+sfA2cDfAdvZfkLStcDxwImdb7R9p6TjgZ+UHtYi4MO2b5Q0G7gb+B1w3XLGeDLwL8Dccp5fA2+jSjobArNK0noI2A/YBfhHSYuAPwPvXc7zR0SMjQYmnG7JzmWP8ZJrTJVFF32l7hAa46Ez7647hMbYc/5jdYfQKHc/OFPL8/6n/+MrXX/eTHnbR5frXGOt9mtMERHxPGhgtV23ejIxdczy0OnjSyteiIjoSRN4KK8nE9PALA8REa01xlV5kvYC/hWYDHxr8JRypRjsLKop3xYCh9qeX7Z1Tkn3W9v7DHeunkxMERGtN4Y9pjLTzmnAW6hu6ZkpaYbtOzt2+xJwnu1zJe0GfA54T9k2qinpmlAuHhERY21sy8W3A+61fb/tZ4ELgX0H7bMZcFVZv3op27uWxBQR0Yv6+rpeOqdOK8uRg462HtUtOQPml7ZOtwHvLOvvAFaV9JLyeqVy3Bsl7TdS6BnKi4joRaMYyuucOm05HAd8TdJhwLXAAqqp5QCm2V4gaWPgKkm3275vqAMlMUVE9KKxLX5YAKzf8XpqaVtyOvsBSo9J0ipU07s9WrYtKF/vl3QNsCUwZGLKUF5ERC8a22tMM4FNJG0kaUXgIGBG5w6S1ioz5gB8kqpCD0lrSnrhwD7AjkBn0cRfSWKKiOhFdvfLiIfyYqpHFF1ONbfp92zPk3SSpIHS712AeyT9ElgH+Exp3xS4pTxb72rg84Oq+f5KhvIiInrRGN9ga/sy4LJBbSd0rP8A+MFS3nc98LrRnCuJKSKiF2VKooiIaBL3T9w5o5OYIiJ6UebKi4iIRpnAT7BNYoqI6EUZyouIiEZZnOKHiIhokgn8dPIkpoiIXpTih4iIaJRcY4qIiEZJVV50Y9FFX6k7hEZY4cCP1h1CY0y55PC6Q2iMyQsydeeYSo8pIiKaxIv7Rt6poZKYIiJ6UYbyIiKiUTKUFxERjZJy8YiIaJT0mCIiolFyjSkiIpokVXkREdEsGcqLiIhGSWKKiIhGyTWmiIholPSYIiKiSbw4PaaIiGiS3GAbERGNkqG8iIholCSmiIhoEjuJKSIimiQ9poiIaJJU5UVERLNM4B7TpLoDiIiI50H/KJYuSNpL0j2S7pX0iaVsnybpSklzJV0jaWrHtvdJ+lVZ3jfSuZKYIiJ6kPvd9TISSZOB04C3ApsBB0vabNBuXwLOs705cBLwufLeFwMnAtsD2wEnSlpzuPMlMUVE9KJ+d7+MbDvgXtv3234WuBDYd9A+mwFXlfWrO7bvCVxhe6HtR4ArgL2GO1kSU0RELxrFUJ6kIyXd0rEcOeho6wG/63g9v7R1ug14Z1l/B7CqpJd0+d7nSPFDREQP8uLuix9snwGcsZynPA74mqTDgGuBBcAyPa1wxB6TpD5JcyTdIelHktZYlhMt5biHSfraWBxriOP/t6TbS+xzJL3heTrPdEl7Px/HjohYVmN5jYkqyazf8XpqaVtyPvsB2++0vSXwqdL2aDfvHaybobynbU+3/VpgIfDhLt7TFLuW2Kfbvr6bN0gabS9yOpDEFBHNMrZVeTOBTSRtJGlF4CBgRucOktaSNJBTPgmcVdYvB/aQtGYpetijtA1ptNeYbqCMDUraTtINkmZLul7Sq0r7YZIukfTjUhr4hY7AD5f0S0k3Azt2tG8o6apSZnilpA1K+zmSTpd0o6T7Je0i6SxJd0k6Z5Sxj3Seb0i6CfiCpFeU+G+V9HNJry77HVB6jrdJurb8A50EHFh6ZQcu5Zz/M3Z71i/mjTbkiIhl4v7ulxGPZS8GjqJKKHcB37M9T9JJkvYpu+0C3CPpl8A6wGfKexcCJ1Mlt5nASaVtSF33Dkq54O7AmaXpbuBNthdLejPwWeBvy7bpwJbAMyXQU4HFwD8BWwOPUVVtzC77nwqca/tcSUcAXwX2K9vWBHYA9qHK0DsCfwfMlDTd9pxhwr5aUh/wjO3tRzjPVOANtvskXQl8yPavJG0PfB3YDTgB2NP2Aklr2H5W0gnANraPWloAnWO3T5121MS94y0iJpYxnvjB9mXAZYPaTuhY/wHwgyHeexZLelAj6iYxTZE0h6qndBdVqR/A6sC5kjYBDKzQ8Z4rbT8GIOlOYBqwFnCN7YdK+0XAK8v+O7CkmuPbwBc6jvUj25Z0O/BH27eX988DNgTmDBP7rrYf7ng93Hm+X5LSKsAbgO9LGtj2wvL1OuAcSd8DLhnmvBERtfLiuiNYdl1fY6JKLmLJNaaTgavLtae3Ayt1vOeZjvU+lq/6b+BY/YOO27+cxx3syfJ1EvBox7Wp6bY3BbD9IeB4qgt5t5ZSyIiIxhnLobzx1vU1JttPAccAHysFAquzpLLisC4OcROws6SXSFoBOKBj2/VUF9MADgF+3m1cozTieWw/Dvxa0gEAqmxR1l9h+6bSfX2IKkE9Aaz6PMUbEbFMWpGYAGzPBuYCB1MNg31O0my66LnY/j3waaoCiuuohgUHHA0cLmku8B7gI6OJaxS6Pc8hwPsl3QbMY8kdzF8sJeh3UCW526iulW02VPFDREQdJnJi0kR+mNREk+KHygoHfrTuEBrjkQMPrzuExtj1jr/UHUKjzPvjTRp5r6H9cZdduv68Weeaa5brXGMtMz9ERPSgJvaEujXhE1O59+iFg5rfM1C9FxHRRv2LG9UJGpUJn5jK/UkREdHBTmKKiIgGyVBeREQ0ivvTY4qIiAaZyAXXSUwRET0oPaaIiGiU/r4kpoiIaJD0mCIiolFSLh4REY2ScvGIiGiU/vSYIiKiSfr7RvXwiEZJYoqI6EG5jykiIholVXkREdEoucYUERGNknLxiIholFxjioiIRunrT1VeREQ0SHpM0ZWHzry77hAaYcolh9cdQmOsedHZdYfQGPes+6a6Q+gpKX6IiIhGSfFDREQ0SnpMERHRKBP4EhMTt2wjIiKG1Nc/qeulG5L2knSPpHslfWIp2zeQdLWk2ZLmStq7tG8o6WlJc8ryjZHOlR5TREQPGsunXkiaDJwGvAWYD8yUNMP2nR27HQ98z/bpkjYDLgM2LNvusz292/OlxxQR0YOMul66sB1wr+37bT8LXAjs+1enhNXK+urAA8saexJTREQP6nf3SxfWA37X8Xp+aev0aeBQSfOpektHd2zbqAzx/UzSiPcFJDFFRPSgftT1IulISbd0LEcuwykPBs6xPRXYG/i2pEnA74ENbG8JfBT4rqTVhjlOrjFFRPSivu6G6ACwfQZwxjC7LADW73g9tbR1ej+wVzneDZJWAtay/SDwTGm/VdJ9wCuBW4Y6WXpMERE9aIyvMc0ENpG0kaQVgYOAGYP2+S2wO4CkTYGVgIckrV2KJ5C0MbAJcP9wJ0uPKSKiB41lVZ7txZKOAi4HJgNn2Z4n6STgFtszgI8B35R0LFUhxGG2LWkn4CRJi0pYH7K9cLjzJTFFRPSgsUxMALYvoypq6Gw7oWP9TmDHpbzvYuDi0ZwriSkiogd1OUTXSElMERE9qH/i5qUkpoiIXjSaqrymSWKKiOhBY32NaTwlMUVE9KB+pccUERENMpEfe5HEFBHRgzKUFxERjbI4Q3kREdEkGcqLiIhGyX1MERHRKBP5GlOjZxeXtJ8kS3r1ENuvkbTNCMe4pjynfo6ku5bxOSPDHf8wSeuO5TEjIpaXR7E0TaMTE9WDp35Rvi6PQ8rz5ncETinTto+Vw4AkpoholH51vzRNYxOTpFWAN1I9fOqg0jZF0oWl53MpMKVj/9PLkxfnSfqnIQ67CvAk0Ffec7Ck2yXdIemUjmP9VbukyZLOKW23SzpW0v7ANsD5pUc2ZWknjYgYb4tHsTRNk68x7Qv82PYvJf1J0tbAzsBTtjeVtDkwq2P/T9leWB5IdaWkzW3PLdvOl/QM1QOq/rftvjL8dgqwNfAI8BNJ+wE3D9H+O2A9268FkLSG7UfLM0qOsz3k0xgjIsabG9gT6lZje0xUw3cXlvULy+udgO8AlKQzt2P/d0maBcwGXgNs1rHtENubAxsAx0maBmwLXGP7IduLgfPL8Ydqvx/YWNKpkvYCHu/mm5B0ZOnJ3fLdhwc/iTgi4vnRP4qlaRrZY5L0YmA34HWSTPXERFMlnaXtvxFwHLCt7UcknUP1WN/nsP1QSV7bU55B361y3C2APYEPAe8CjujifWcAZwD8Zqs3N/E6Y0T0oCYmnG41tce0P/Bt29Nsb2h7feDXwK3AuwEkvRbYvOy/GtW1o8ckrQO8dWkHlfQiYEvgPqohu50lrVWG/w4GfjZUu6S1gEnlaYzHA1uVwz4BrDq2335ExPKZyFV5jewxUSWDUwa1XUyVVKZIugu4iypRYfs2SbOBu6muBV036L3nS3oaeCFwju1bASR9ArgaEPCftv99qPbSWzpb0kAy/2T5eg7wjXL8HWw/PRY/gIiI5dHEartuNTIx2d51KW1fHeE9hw3Rvssw77kAuKCbdtu3saSX1Nk+6ufZR0Q835pYbdetRiamiIhYPk0coutWElNERA/KUF5ERDTKRK7KS2KKiOhBGcqLiIhGWTyBU1MSU0RED5q4aSmJKSKiJ+UaU0RENEqq8iIiolH6J/BgXhJTREQPmrhpqbmTuEZExHJYjLteuiFpL0n3SLq3zCc6ePsGkq6WNFvSXEl7d2z7ZHnfPZL2HOlc6TFFRPSgsewxlSctnAa8BZgPzJQ0w/adHbsdD3zP9umSNgMuAzYs6wdRPSdvXeCnkl5pu2+o86XHFBHRg8b4QYHbAffavt/2s1QPb9130D6megQRwOrAA2V9X+BC28/Y/jVwbznekJKYIiJ6UD/ueul80nZZjhx0uPWoHik0YH5p6/Rp4FBJ86l6S0eP4r3PkaG8iIgeNJqhvM4nbS+Hg6med/dlSTsA3y4PdB21JKaIiB40xjfYLgDW73g9tbR1ej+wF4DtGyStBKzV5XufI0N5ERE9qA93vXRhJrCJpI0krUhVzDBj0D6/BXYHkLQpsBLwUNnvIEkvlLQRsAlw83AnS48pIqIHjeUNtrYXSzoKuByYDJxle56kk4BbbM8APgZ8U9KxVCOJh9k2ME/S94A7qR6s++HhKvIgiSkioieN9Q22ti+jKmrobDuhY/1OYMch3vsZ4DPdniuJKSKiB2VKooiIaJTMLh5d2XP+Y3WH0AiTF6TmZsA9676p7hAa4+kHfl53CD2ly6KGRkpiiojoQU5iioiIJslQXkRENEq/02OKiIgGmbhpKYkpIqInpVw8IiIaJVV5ERHRKOkxRUREo6RcPCIiGiXl4hER0ShOuXhERDRJrjFFRESjpCovIiIaJT2miIholFxjioiIRklVXkRENEruY4qIiEbp88TtMyUxRUT0oBQ/REREo2QoLyIiGiUPCoyIiEaZuGkpiSkioiflGlNERDRKqvIiIqJRJnKPaVI3O0laR9J3Jd0v6VZJN0h6x7KeVNKnJR1X1k+S9OZlPM50SXuPsM9hkh6SNEfSPEk/kPSiZTnfssYQETHePIr/mmbExCRJwA+Ba21vbHtr4CBg6qD9lqn3ZfsE2z9dlvcC04FuksJFtqfbfg3wLHDgMp5veWKIiBg3trtemqabHtNuwLO2vzHQYPs3tk8tvZEZkq4CrpS0iqQrJc2SdLukfQfeI+lTkn4p6RfAqzraz5G0f1nfWtLPSq/sckkvL+3XSDpF0s3lGG+StCJwEnBg6Q2NmGxK8lwZeKS83lDSVZLmlrg3GKH9AEl3SLpN0rXLEkNExHjox10v3ZC0l6R7JN0r6RNL2f7P5XNwTvmcfrRjW1/HthkjnaubXs5rgFnDbN8K2Nz2wvLB/w7bj0taC7ixBLEVVS9rejnnLODWQd/UCsCpwL62Hyof8p8BjhiI1fZ2ZdjsRNtvlnQCsI3to0b4Hg6U9Ebg5cAvgR+V9lOBc22fK+kI4KvAfsO0nwDsaXuBpDVsPztSDJKOBI4EWGeVaawxZe0RQo2IWH5jWfwgaTJwGvAWYD4wU9IM23cO7GP72I79jwa27DjE07and3u+rq4xDQrwtNJjmFmarrC9cGAz8FlJc4GfAusB6wBvAi61/ZTtx4GlZcxXAa8FrpA0Bzie5w4XXlK+3gpsOMqwLyo/lJcBtwP/WNp3AL5b1r8NvHGE9uuAcyR9AJjczYltn2F7G9vbJClFxHgZ42tM2wH32r7f9rPAhcC+w+x/MHDBssbeTWKaR9XjAcD2h4HdgYFP2Sc79j2ktG9dEsEfgZW6jEXAvHItaLrt19neo2P7M+VrH8tYTehqMPVHwE7L+P4PUSXM9YFbJb1kWY4TEfF867e7XrqwHvC7jtfzS9tfkTQN2Ai4qqN5JUm3SLpR0n4jnaybxHRVOejfd7QNVdW2OvCg7UWSdgWmlfZrgf0kTZG0KvD2pbz3HmBtSTtANbQn6TUjxPYEsGoX30OnNwL3lfXrqYYYoUqqPx+uXdIrbN9k+wTgIaoEtSwxREQ8r0bTY5J0ZEkcA8uRy3Hqg4Af2O7raJtmexvg3cC/SHrFcAcYMTGVXsZ+wM6Sfi3pZuBc4ONL2f18YBtJtwPvBe4ux5gFXATcBvwXMHPwG0v3cH/gFEm3AXOAN4wQ3tXAZl0UHgwUJ8ylGvc8ubQfDRxe2t8DfGSE9i+Woo47qJLXbaOIISJi3Iymx9R5yaEsZww63AKqP8QHTC1tS3MQg4bxbC8oX+8HruG515/+ippYKtirXv3SbfPDBiZr1Jc2e9Y9j8yvO4TGePqBn4+8U4ussNbGWp73j+bz5u4HZw57rlLY9kuqyzgLqDoX77Y9b9B+rwZ+DGxUOjVIWhN4yvYzpSjuBqoitzsZQmZ+iIjoQWNZlWd7saSjgMupCr/Osj1P0knALbYHCtoOAi70c3s8mwL/JqmfapTu88MlJeihxCTpcJYMuQ24rhRrRES0ylg/9sL2ZcBlg9pOGPT600t53/XA60Zzrp5JTLbPBs6uO46IiCZo4lRD3eqZxBQREUs4s4tHRESTTOTZxZOYIiJ60ESuuE5iiojoQXlQYERENMpYV+WNpySmiIgelKq8iIholFxjioiIRklVXkRENEpff4ofIiKiQTKUFxERjZKhvIiIaJT0mCIiolFyH1NERDRK7mOKiIhGSVVeREQ0SnpMERHRKCl+iIiIRpnIiUkTOfgYPUlH2j6j7jiaID+LJfKzWCI/i/pNqjuAGHdH1h1Ag+RnsUR+FkvkZ1GzJKaIiGiUJKaIiGiUJKb2ydj5EvlZLJGfxRL5WdQsxQ8REdEo6TFFRESjJDFFRESjJDFFRESjJDFF60g6oJu2Xibphd20tUV+J5oliaklJO0oaeWyfqikr0iaVndcNflkl2297IYu29oivxMNkrny2uN0YAtJWwAfA74FnAfsXGtU40jSW4G9gfUkfbVj02rA4nqiGl+SXgasB0yRtCWgsmk14EW1BVaT/E40UxJTeyy2bUn7Al+zfaak99cd1Dh7ALgF2Ae4taP9CeDYWiIaf3sChwFTgS+zJDE9AfzfmmKqU34nGij3MbWEpJ8BPwaOAN4EPAjcZvt1tQZWA0kr2F5U1tcE1rc9t+awxpWkv7V9cd1xNEV+J5ol15ja40DgGeAI23+g+ov5i/WGVJsrJK0m6cXALOCbkv657qDG2dTyM5Ckb0maJWmPuoOqUX4nGiSJqSVKMroYGKi8ehi4tL6IarW67ceBdwLn2d4e2L3mmMbbEeVnsAfwEuA9wOfrDalW+Z1okCSmlpD0AeAHwL+VpvWAH9YWUL1eIOnlwLuA/6g7mJoMXFvam+qDeF5HWxvld6JBkpja48PAjsDjALZ/Bby01ojqcxJwOXCf7ZmSNgZ+VXNM4+1WST+hSkyXS1oV6K85pjrld6JBUvzQEpJusr29pNm2t5T0AmCW7c3rji3Gn6RJwHTgftuPSnoJsF4u+EcTpMfUHj+T9H+p7l95C/B94Ec1x1QLSa+UdKWkO8rrzSUdX3dc48l2P1UBzPGSvgS8oc1JSdJUSZdKerAsF0uaWndcbZXE1B6fAB4Cbgc+CFwGtOrDuMM3qe7qXwRQPpAPqjWicSbp88BHgDvLcoykz9YbVa3OBmYA65blR6UtapChvBaRNAXYwPY9dcdSJ0kzbW87MKxZ2ubYnl5zaONG0lxgeuk5IWkyMLutQ7tL+/dv2+9Ek6TH1BKS9gHmUN1ki6TpkmbUGlR9Hpb0CsAAkvYHfl9vSLVYo2N99bqCaIg/lTkkJ5flUOBPdQfVVpmSqD1OBLYDrgGwPUfSRrVGVJ8PUz0++9WSFgC/Bg6pN6Rx9zlgtqSrqcrEd6Ia7m2rI4BTgYGbaq8DDq8vnHZLYmqPRbYfk55zq0rrxnHLkNU/2H5zmW19ku0n6o5rvNm+QNI1wLal6ePlJuxWsv0bqvnyogEylNce8yS9G5gsaRNJpwLX1x3UeLPdB7yxrD/ZxqTUYVuqntJOLElQrZSqvGZJ8UNLSHoR8CmqKWigupnw/9n+S31R1UPS6VQzX3wfeHKg3fYltQU1zkpV3rbA+aXpYGCm7TbOMI6kK4DvAt8uTYcCh9h+S31RtVcSUwuU4auf2t617liaQNLSyoBt+4hxD6Ymqcp7rlTlNUuuMbWA7T5J/ZJWt/1Y3fHUzXYualfWABaW9VTlVZV4F5TXB5OqvNokMfU4Sa+3fSPwZ+D2MmTROXx1TG3B1aRcOziVau5AgJ8DH7E9v76oxl2q8p6rsyrPVNdf8wdMTTKU1+MkzbK9laT3LW277XPHO6a65XpCpcymPVD0cHObq/KiWZKYetxAYqo7jiZp+/WEMoFvn21LWh/YnmpW7dk1hzbuJK1E9RDNR6imIfpHqt7jfcDJth+uMbzWSmLqcZIeBa4darvt1t27IelKqnnQOq8nHG675x8MV57LdQrV0O7JVB/Es4AtgbNsn1JjeONO0veo5kxcGVgTuIMqQb2RqjjkbTWG11pJTD1O0q+Avxtqu+2fjWM4jSBpGtX1hB1Ycj3hGNu/rTWwcSBpHtWH7qrAXcA02w+X2wlm2n5NrQGOM0l32H5t6UXOt/2yjm232d6ixvBaK8UPve+JNiafpRkoBGn5Xf7P2n4EeETSvQNDVbafkvRszbHV4VkA24slPTBoW18N8QRJTG3w393sJOkttq94nmOp29eBrQAk3WB7h5rjqcMUSVtSzfqyYllXWVaqNbJ6TJX0Varvf2Cd8nq9+sJqtwzlBdCOIolBj7n4n/U2KeXhQ2rbTdhDVasOaGPVahOkxxQDNPIuE94kSWtS9RYG1v/n+7a9cMh39oi2JZ6RdJt4JJ1q++jnO56oJDHFgDZ0nVcHbmVJMprVsc3AxuMeUY0kvRbYjI4hPNvn1RdRo+048i4xVpKYojVsb9jNfpJeY3ve8xxOrSSdCOxClZguA94K/AJIYora5bEXMeC/6w6gQb498i4T3v7A7sAfytyBW5D58qIh0mNqkeGGbmy/s664GqgN19uett0vabGk1YAHgfXrDqrB2vA70RjpMbVEGbo5tSy7Al+gvffyjKQN19tukbQG8E2q626zgBtqjahGkg4Yoe1fxzGc1ku5eEtIup1quGa27S0krQN8p20Tl3ajDaXznSRtCKxme27dsdRlaf/mbfs9aJIM5bVHhm6617MzIEga8oNW0la2Zw21vRdJeiuwN7Bex821AKsBi+uJKpKY2mPw0M2faffQzebAhnT8PzDwaHXbr68prPHw5fJ1JWAb4Daq6yebA7dQzR/YJg9Qfd/7UP1/MeAJ4NhaIooM5bVR24duJJ1F9UE8D+gvzW17tPolwIm2by+vXwt82vb+9UZWD0kr2F5UdxxRSWJqCUkCDgE2tn2SpA2Al9m+uebQxp2kO21vVnccdZI0b/BM4ktrawtJOwKfBqZR9aJF9cdKq266booM5bXH16l6B7sBJ1ENVVzMkieYtskNkjazfWfdgdRorqRvAd8prw+hGtZrqzOphu5uJbOK1y6JqT22L49Ynw1g+xFJK9YdVE3Oo0pOfwCeYclfx5vXG9a4Ohz4e+AYqu//VmCjWiOq12O2/6vuIKKSxNQeiyRNptyjI2ltllxfaZszgfcAt9PSn4Htv0i6BlgXeBewBlUPuq2ulvRF4BKqP1YAaFuVYlMkMbXHV4FLgZdK+gzVlDTH1xtSbR6yPaPuIOog6ZVUj5I/GHgYuAgy6ziwffm6TUebqYa+Y5yl+KEFJE0CXg8spJofTcCVtu+qNbCaSPo6VQ/hRzz3r+NL6oppvEjqB34OvN/2vaXt/lzkjyZJj6kFyo21p5UH491ddzwNMIUqIe3R0WaqYZxe907gIKqhqx8DF5J54CgzoXwWWNf2WyVtBuxg+8yaQ2ul9JhaQtKXqG6ovcT5R289SSsD+1IN6e1GVRByqe2f1BpYTST9F3A28KkyZdcLqKbvel3NobVSJnFtjw8C3weekfS4pCckPV53UHWQNFXSpZIeLMvFkqbWHdd4sv2k7e/afjswFZgNfLzmsOq0lu3vUYphbC8mZeO1SWJqCdur2p5ke0Xbq5XXq9UdV03OBmZQVaStS3Wt6exaI6qR7Udsn2F797pjqdGTkl7CkqrV1wOP1RtSeyUxtYSk9w96Pbk8CqON1rZ9tu3FZTkHWLvuoKJWH6X6Y+UVkq6jGto8ut6Q2iuJqT12l3SZpJeXedFuBFatO6ia/EnSoSU5T5Z0KPCnuoOK+pT7lXYG3kA17P2ats4l2QQpfmgRSQcCpwFPAu+2fV3NIdVC0jSqBybuQDV0cz1wjO3f1hpYjDtJu9m+StJSn+DchlsImijl4i0haRPgI1R3928KvEfSbNtP1RvZ+LP9G/L03qjsDFwFvH0p29pyC0HjpMfUEpLuBj5s+8oy0/hHgSPaNJu0pFMZ5rHpto8Zx3AiYgjpMbXHdrYfh2q2UuDLkn5Uc0zj7ZbydUdgM8p0PMABQJtnGm8tSR8dbrvtr4xXLLFEih96nKT/A2D7cUkHDNp82PhHVB/b59o+l+ohgbvYPtX2qVTTNE2vNbioy6pl2YZqtvX1yvIhYMjH0MfzK0N5PU7SLNtbDV5f2uu2kHQP1XQzC8vrNYEbbb+q3siiLpKuBf7G9hPl9arAf9reqd7I2ilDeb1PQ6wv7XVbfB6YLelqqp/BTlRPL432Wgd4tuP1s6UtapDE1Ps8xPrSXreC7bPL3GgDjzr4uO0/1BlT1O484GZJl5bX+wHn1hdOu2Uor8dJ6qO6b0lUs2oPlIcLWMn2CnXFVidJ6wHT6PjjzPa19UUUdZO0NfDG8vJa27PrjKfNkpiidSSdAhwIzGPJE2xtO/c2tZyklwIrDbzOTdf1SGKK1inFD5vbfmbEnaMVJO0DfJlqUt8HgQ2Au9t0n1+TpFw82uh+oJVDmDGkk6me8vxL2xsBb6aaTzJqkOKHaKOngDmSruS5j1bPzA/ttcj2nyRNkjTJ9tWS/qXuoNoqiSnaaEZZIgY8KmkV4FrgfEkPUhUNRQ1yjSkiWq88av5pqssbhwCrA+fbzuNQapDEFK1TZlr/HNV8eZ0VWBvXFlTURtJk4Ke2d607lqik+CHa6GzgdGAxsCvVzZXfqTWiqI3tPqBf0up1xxKV9JiidSTdantrSbfbfl1nW92xRT0k/TuwJXAFHdeWUhBTjxQ/RBs9I2kS8CtJRwELgFVqjinqdQlLHgo48Nd6W+eSrF0SU7TRR4AXAcdQ3b+yG/DeWiOKWkjaF5hq+7Ty+mZgbark9PE6Y2uzDOVF65WL3wfZPr/uWGJ8SbqO6t/+d+X1HKo/VFYBzra9e43htVaKH6I1JK0m6ZOSviZpD1WOAu4F3lV3fFGLFQeSUvEL2wvLHHkr1xVU26XHFK1RLnA/AtxA9dTal1JdR/iI7Tk1hhY1kXSv7f81xLb7bL9ivGOKXGOKdtm4owrvW8DvgQ1s/6XesKJGN0n6gO1vdjZK+iBwc00xtV4SU7TJooEV232S5icptd6xwA8lvRuYVdq2Bl5I9bDAqEGG8qI1Oh6aCM99cKKonse0Wl2xRb0k7QYMPOJinu2r6oyn7ZKYIiKiUVKVFxERjZLEFBERjZLEFBERjZLEFBERjZLEFBERjfL/AVIXJpPRNvMFAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.heatmap(second_layer_train.corr())\n",
    "\n",
    "# 如何查看模型的相关性呢？如果模型的相关性越差，则模型融合的提升就越大"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 405,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = np.concatenate([extra_tree_train, rf_train, ada_train, gb_train], axis=1)\n",
    "x_test = np.concatenate([extra_tree_test, rf_test, ada_test, gb_test], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 408,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(891, 4)\n",
      "(418, 4)\n"
     ]
    }
   ],
   "source": [
    "print(x_train.shape)\n",
    "print(x_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 409,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 第二层我们使用XGBoost来处理\n",
    "gbm = xgb.XGBClassifier(\n",
    "    #learning_rate = 0.02,\n",
    " n_estimators= 2000,\n",
    " max_depth= 4,\n",
    " min_child_weight= 2,\n",
    " #gamma=1,\n",
    " gamma=0.9,                        \n",
    " subsample=0.8,\n",
    " colsample_bytree=0.8,\n",
    " objective= 'binary:logistic',\n",
    " nthread= -1,\n",
    " scale_pos_weight=1).fit(x_train, y_train)\n",
    "predictions = gbm.predict(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 417,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 模型融合到此结束，我们再从头好好理清下思路\n",
    "# predictions\n",
    "\n",
    "submission = pd.DataFrame({\n",
    "    'PassengerId':np.arange(predictions.shape[0]),\n",
    "    'Survived':predictions,\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 419,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple\n",
      "Collecting pandas\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/a7/f7/2adca20a7fa71b6a32f823bbd83992adeceab1d8bf72992bb7a55c69c19a/pandas-1.1.0-cp36-cp36m-manylinux1_x86_64.whl (10.5 MB)\n",
      "\u001b[K     |████████████████████████████████| 10.5 MB 350 kB/s eta 0:00:01�▎          | 6.9 MB 430 kB/s eta 0:00:09\n",
      "\u001b[?25hRequirement already satisfied, skipping upgrade: pytz>=2017.2 in /root/jupyter env/lib/python3.6/site-packages (from pandas) (2020.1)\n",
      "Requirement already satisfied, skipping upgrade: numpy>=1.15.4 in /root/jupyter env/lib64/python3.6/site-packages (from pandas) (1.19.1)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil>=2.7.3 in /root/jupyter env/lib/python3.6/site-packages (from pandas) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.5 in /root/jupyter env/lib/python3.6/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n",
      "Installing collected packages: pandas\n",
      "  Attempting uninstall: pandas\n",
      "    Found existing installation: pandas 0.25.3\n",
      "    Uninstalling pandas-0.25.3:\n",
      "      Successfully uninstalled pandas-0.25.3\n",
      "\u001b[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.\n",
      "\n",
      "We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.\n",
      "\n",
      "civisml-extensions 0.2.1 requires pandas~=0.19; python_version >= \"3.5\", but you'll have pandas 1.1.0 which is incompatible.\u001b[0m\n",
      "Successfully installed pandas-1.1.0\n"
     ]
    }
   ],
   "source": [
    "!pip3 install -U pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 421,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission.to_csv(r'Submission.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 423,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0  PassengerId  Survived\n",
       "0           0            0         0\n",
       "1           1            1         1\n",
       "2           2            2         0\n",
       "3           3            3         0\n",
       "4           4            4         1"
      ]
     },
     "execution_count": 423,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_csv('Submission.csv').head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 提交数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 424,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 注意列不能从0到418，应该从418开始\n",
    "# 一定要index=False\n",
    "submission.to_csv(r'Submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 431,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pd.read_csv('Submission.csv')\n",
    "# import pandas as pd\n",
    "# import os\n",
    "# import numpy as np\n",
    "\n",
    "# os.chdir('/Users/mikizhu/Desktop')\n",
    "# data = pd.read_csv('Submission.csv')\n",
    "\n",
    "# data.drop(['Unnamed: 0'], axis=1, inplace=True)\n",
    "# data.to_csv('Submission.csv', index=False)\n",
    "# # print(pd.read_csv('test_result.csv'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 内容补充"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 垃圾回收机制"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://www.jianshu.com/p/b6a20c812ce4\n",
    "\n",
    "https://www.cnblogs.com/franknihao/p/7326849.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "在平时我们往往因为计算机内存不足而不能很好地运行程序，所以我们通常要释放内存\n",
    "\n",
    "python的垃圾回收机制就是。当一个对象没有被引用时。py就会自动释放这个数据的内存。但这样会牺牲一些效率\n",
    "\n",
    "使用py的gc模块，进行手动垃圾回收"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 1\n",
    "a = 2\n",
    "# 前面的1就会被释放，因为没有变量引用这个数\n",
    "a = []\n",
    "# 形成了循环引用，要用垃圾回收机制\n",
    "b = []\n",
    "a.append(b)\n",
    "b.append(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import gc\n",
    "\n",
    "# 返回循环引用释放掉的对象个数\n",
    "gc.collect()\n",
    "# 每次执行就会检查内存一次，将这些循环引用的对象释放"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KFold"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这里我们单独创建一个数据集进行测试观察结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>tag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>xiaoming</th>\n",
       "      <td>1213</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>xiaohong</th>\n",
       "      <td>121</td>\n",
       "      <td>e</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zhangsan</th>\n",
       "      <td>123</td>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>lisi</th>\n",
       "      <td>311</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date tag\n",
       "xiaoming  1213   a\n",
       "xiaohong   121   e\n",
       "zhangsan   123   c\n",
       "lisi       311   d"
      ]
     },
     "execution_count": 200,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d1 = {\n",
    "    'xiaoming':1213,\n",
    "    'xiaohong':111,\n",
    "    'zhangsan':123,\n",
    "    'lisi':311,\n",
    "    'xiaohong':121\n",
    "}\n",
    "d2 = {\n",
    "    'xiaoming':'a',\n",
    "    'xiaohong':'b',\n",
    "    'zhangsan':'c',\n",
    "    'lisi':'d',\n",
    "    'xiaohong':'e'\n",
    "}\n",
    "temp1 = pd.Series(d1)\n",
    "temp2 = pd.Series(d2)\n",
    "temp_train = pd.DataFrame({'date':d1,'tag':d2})\n",
    "temp_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import KFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([2, 3]), array([0, 1]))\n",
      "(array([0, 1]), array([2, 3]))\n"
     ]
    }
   ],
   "source": [
    "kf = KFold(n_splits=2,shuffle=False,random_state=0)\n",
    "for i in kf.split(temp_train):\n",
    "    print(i)\n",
    "# 注意是将序号打乱，返回的是序号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([0, 3]), array([1, 2]))\n",
      "(array([1, 2]), array([0, 3]))\n"
     ]
    }
   ],
   "source": [
    "kf = KFold(n_splits=2,shuffle=True)\n",
    "for i in kf.split(temp_train):\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "打乱其实非常有用的，可以避免数据的某些特征模型没有学习到\n",
    "\n",
    "可以看到传回的其实只是序列号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          date tag\n",
      "xiaohong   121   e\n",
      "lisi       311   d\n",
      "          date tag\n",
      "xiaoming  1213   a\n",
      "zhangsan   123   c\n",
      "********************\n",
      "          date tag\n",
      "xiaoming  1213   a\n",
      "zhangsan   123   c\n",
      "          date tag\n",
      "xiaohong   121   e\n",
      "lisi       311   d\n",
      "********************\n"
     ]
    }
   ],
   "source": [
    "kf = KFold(n_splits=2,shuffle=True)\n",
    "for i,j in kf.split(temp_train):\n",
    "    x1_train, x1_test = temp_train.iloc[i,:], temp_train.iloc[j,:]\n",
    "    print(x1_train)\n",
    "    print(x1_test)\n",
    "    print('*'*20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 交叉验证"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "其实在平时的模型训练中，我们通常就是将70%作为训练集，30%作为测试集，来训练模型。这样有个缺点就是：\n",
    "- 模型预测的结果好坏，很大层度上会受到数据集划分的影响\n",
    "\n",
    "由此提出了交叉验证的方式，最常用的就是k—fold交叉验证。假如将数据划分成5部分，每一部分都会作为测试集，其他部分作为训练集。\n",
    "\n",
    "因此最后会得到5个模型，[理论参考自知乎](https://zhuanlan.zhihu.com/p/24825503)\n",
    "\n",
    "**k一般设置成5-10，这样可以避免模型过拟合**\n",
    "\n",
    "**但是计算成本比较高，数据量大的时候几乎不能使用**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "****\n",
    "当k=数据集大小时，这时候叫做留1法。此时只有1个数据作为测试集，其他模型作为训练集"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### cross_val_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "cross_val_score的作用就是打分而已，假如数据集划分成5部分，那么会得到5个分数\n",
    "\n",
    "**作用，如果分差别比较大，那么说明模型的泛化能力比较差**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"\\nfrom sklearn.model_selection import cross_val_score\\n\\nclf = SVC(kernel='linear', C=1)\\nscores = cross_val_score(clf, x, y, cv=5) # x表示特征，y表示标签\\n\""
      ]
     },
     "execution_count": 222,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "clf = SVC(kernel='linear', C=1)\n",
    "scores = cross_val_score(clf, x, y, cv=5) # x表示特征，y表示标签\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nfrom sklearn.model_selection import ShuffleSplit\\nmy_cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)\\nscores = cross_val_score(clf, iris.data, iris.target, cv=my_cv)\\n'"
      ]
     },
     "execution_count": 224,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 上面的cv=5表示进行5次交叉验证，默认是不会打乱顺序的，我们可以自己定义cv\n",
    "\"\"\"\n",
    "from sklearn.model_selection import ShuffleSplit\n",
    "my_cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)\n",
    "scores = cross_val_score(clf, iris.data, iris.target, cv=my_cv)\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "参考 https://www.cnblogs.com/jiaxin359/p/8552800.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### cross_val_predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"\\nfrom sklearn.model_selection import cross_val_predict\\n\\nclf = SVC(kernel='linear', C=1)\\nscores = cross_val_predict(clf, x, y, cv=5) # x表示特征，y表示标签\\n\""
      ]
     },
     "execution_count": 223,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "\n",
    "clf = SVC(kernel='linear', C=1)\n",
    "scores = cross_val_predict(clf, x, y, cv=5) # x表示特征，y表示标签\n",
    "\"\"\"\n",
    "\n",
    "# 假如数据分成5部分，前4部分作为训练集，5部分作为测试集，先会预测出第五部分的值，最后返回所有预测结果\n",
    "\n",
    "# 这就说明，有些部分可能会拟合的比较好，不知道模型的效果会不会有所提升"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 问题思考"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "其实像交叉验证，只是将数据集进行简单的划分，得到很多个得分而已，然而对实际的模型预测有什么实质性的帮助呢"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ShuffleSplit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "参考自 https://www.cnblogs.com/jiaxin359/p/8552800.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import ShuffleSplit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [],
   "source": [
    "sf = ShuffleSplit(n_splits=3, train_size=0.8, random_state=2020)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([1, 3, 0]), array([2]))\n",
      "(array([3, 1, 2]), array([0]))\n",
      "(array([1, 0, 3]), array([2]))\n"
     ]
    }
   ],
   "source": [
    "for i in sf.split(temp_train):\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "可以看到训练集是会有重复的"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 在jupyter中使用shell"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "其实我们在jupyter中是可以使用shell命令的\n",
    "\n",
    "- 只要在前面加个感叹号即可"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "参考自 https://zhuanlan.zhihu.com/p/83628598"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## python中的解包"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "python中可以使用* and \\** 进行解包，双星号是解包字典，由于字典是无序的，所以在使用的时候，参数会在字典中进行匹配，然后炸开字典，把值一一传给参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 注意解包只能在函数参数传递中进行，不能使用在其他环境"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 327,
   "metadata": {},
   "outputs": [],
   "source": [
    "svc_params = {\n",
    "    'kernel' : 'linear',\n",
    "    'C' : 0.025\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 328,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "linear 0.025\n"
     ]
    }
   ],
   "source": [
    "class A:\n",
    "    def __init__(self, C, kernel):\n",
    "        print(kernel, C)\n",
    "        \n",
    "a = A(**svc_params)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 323,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "linear 0.025\n"
     ]
    }
   ],
   "source": [
    "def test(kernel, C):\n",
    "    print(kernel,C)\n",
    "\n",
    "test(**svc_params)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 参考资料"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. 资料比较全，包括了所有机器学习部分的讲解，还有stacking和面试等知识，方便查阅\n",
    "https://www.jianshu.com/p/28f02bb59fe5\n",
    "2. https://www.cnblogs.com/jiaxin359/ 这个人写的比较详细，可以看看"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 待解决问题"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 在对连续型数据进行填充时，使用中位数填充还是平均数填充"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TODO"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "200px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
